5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/doc/notebooks/demo_derivatives.ipynb
+++ b/doc/notebooks/demo_derivatives.ipynb
+%% Cell type:code id: tags:
+
+``` python
+from pystencils.session import *
+```
+
+%% Cell type:markdown id: tags:
+
+# Demo: Working with derivatives
+
+
+## Overview
+This notebook demonstrates how to formulate continuous differential operators in *pystencils* and automatically derive finite difference stencils from them.
+
+Instead of using the built-in derivatives in *sympy*, *pystencils* comes with its own derivative objects. They represent spatial derivatives of pystencils fields.
+
+%% Cell type:code id: tags:
+
+``` python
+f = ps.fields("f: [2D]")
+first_derivative_x = ps.fd.diff(f, 0)
+first_derivative_x
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {f}_{(0,0)}}$
+    D(f[0,0])
+
+%% Cell type:markdown id: tags:
+
+This object is the derivative of the field $f$ with respect to the first spatial coordinate $x$. To get a finite difference approximation a discretization strategy is required:
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order = ps.fd.Discretization2ndOrder(dx=sp.symbols("h"))
+discretize_2nd_order(first_derivative_x)
+```
+
+%% Output
+
+
+    $\displaystyle \frac{{f}_{(1,0)} - {f}_{(-1,0)}}{2 h}$
+    f_E - f_W
+    ─────────
+       2⋅h
+
+%% Cell type:markdown id: tags:
+
+Strictly speaking, derivative objects act on *field accesses*, not *fields*, that why there is a $(0,0)$ index at the field:
+
+%% Cell type:code id: tags:
+
+``` python
+first_derivative_x
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {f}_{(0,0)}}$
+    D(f[0,0])
+
+%% Cell type:markdown id: tags:
+
+Sometimes it might be useful to specify derivatives at an offset e.g.
+
+%% Cell type:code id: tags:
+
+``` python
+derivative_offset = ps.fd.diff(f[0, 1], 0)
+derivative_offset, discretize_2nd_order(derivative_offset)
+```
+
+%% Output
+
+
+    $\displaystyle \left( {\partial_{0} {f}_{(0,1)}}, \  \frac{{f}_{(1,1)} - {f}_{(-1,1)}}{2 h}\right)$
+    ⎛           f_NE - f_NW⎞
+    ⎜D(f[0,1]), ───────────⎟
+    ⎝               2⋅h    ⎠
+
+%% Cell type:markdown id: tags:
+
+Another example with second order derivatives:
+
+%% Cell type:code id: tags:
+
+``` python
+laplacian = ps.fd.diff(f, 0, 0) + ps.fd.diff(f, 1, 1)
+laplacian
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{1} {\partial_{1} {f}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(f[0,0]))
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order(laplacian)
+```
+
+%% Output
+
+
+    $\displaystyle \frac{- 2 {f}_{(0,0)} + {f}_{(1,0)} + {f}_{(-1,0)}}{h^{2}} + \frac{- 2 {f}_{(0,0)} + {f}_{(0,1)} + {f}_{(0,-1)}}{h^{2}}$
+    -2⋅f_C + f_E + f_W   -2⋅f_C + f_N + f_S
+    ────────────────── + ──────────────────
+             2                    2
+            h                    h
+
+%% Cell type:markdown id: tags:
+
+## Working with derivative terms
+
+No automatic simplifications are done on derivative terms i.e. linearity relations or product rule are not applied automatically.
+
+%% Cell type:code id: tags:
+
+``` python
+f, g = ps.fields("f, g :[2D]")
+c = sp.symbols("c")
+δ = ps.fd.diff
+
+expr = δ( δ(f, 0) +  δ(g, 0) + c + 5 , 0)
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} (c + {\partial_{0} {f}_{(0,0)}} + {\partial_{0} {g}_{(0,0)}} + 5) }$
+    D(c + Diff(f_C, 0, -1) + Diff(g_C, 0, -1) + 5)
+
+%% Cell type:markdown id: tags:
+
+This nested term can not be discretized automatically.
+
+%% Cell type:code id: tags:
+
+``` python
+try:
+    discretize_2nd_order(expr)
+except ValueError as e:
+    print(e)
+```
+
+%% Output
+
+    Only derivatives with field or field accesses as arguments can be discretized
+
+%% Cell type:markdown id: tags:
+
+### Linearity
+The following function expands all derivatives exploiting linearity:
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr)
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} c} + {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(c) + D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:markdown id: tags:
+
+The symbol $c$ that was included is interpreted as a function by default.
+We can control the simplification behaviour by specifying all functions or all constants:
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr, functions=(f[0,0], g[0, 0]))
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr, constants=[c])
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:markdown id: tags:
+
+The expanded term can then be discretized:
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order(ps.fd.expand_diff_linear(expr, constants=[c]))
+```
+
+%% Output
+
+
+    $\displaystyle \frac{- 2 {f}_{(0,0)} + {f}_{(1,0)} + {f}_{(-1,0)}}{h^{2}} + \frac{- 2 {g}_{(0,0)} + {g}_{(1,0)} + {g}_{(-1,0)}}{h^{2}}$
+    -2⋅f_C + f_E + f_W   -2⋅g_C + g_E + g_W
+    ────────────────── + ──────────────────
+             2                    2
+            h                    h
+
+%% Cell type:markdown id: tags:
+
+### Product rule
+
+The next cells show how to apply product rule and its reverse:
+
+%% Cell type:code id: tags:
+
+``` python
+expr = δ(f[0, 0] * g[0, 0], 0 )
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} ({f}_{(0,0)} {g}_{(0,0)}) }$
+    D(f_C*g_C)
+
+%% Cell type:code id: tags:
+
+``` python
+expanded_expr = ps.fd.expand_diff_products(expr)
+expanded_expr
+```
+
+%% Output
+
+
+    $\displaystyle {f}_{(0,0)} {\partial_{0} {g}_{(0,0)}} + {g}_{(0,0)} {\partial_{0} {f}_{(0,0)}}$
+    f_C⋅D(g[0,0]) + g_C⋅D(f[0,0])
+
+%% Cell type:code id: tags:
+
+``` python
+recombined_expr = ps.fd.combine_diff_products(expanded_expr)
+recombined_expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} ({f}_{(0,0)} {g}_{(0,0)}) }$
+    D(f_C*g_C)
+
+%% Cell type:code id: tags:
+
+``` python
+assert recombined_expr == expr
+```
+
+%% Cell type:markdown id: tags:
+
+### Evaluate derivatives
+
+Arguments of derivative need not be to be fields, only when trying to discretize them.
+The next cells show how to transform them to *sympy* derivatives and evaluate them.
+
+%% Cell type:code id: tags:
+
+``` python
+k = sp.symbols("k")
+expr = δ(k**3 + 2 * k, 0 )
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} (k^{3} + 2 k) }$
+    D(k**3 + 2*k)
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.evaluate_diffs(expr, var=k)
+```
+
+%% Output
+
+
+    $\displaystyle 3 k^{2} + 2$
+       2
+    3⋅k  + 2
+%% Cell type:code id: tags:
+
+``` python
+from pystencils.session import *
+```
+
+%% Cell type:markdown id: tags:
+
+# Demo: Working with derivatives
+
+
+## Overview
+This notebook demonstrates how to formulate continuous differential operators in *pystencils* and automatically derive finite difference stencils from them.
+
+Instead of using the built-in derivatives in *sympy*, *pystencils* comes with its own derivative objects. They represent spatial derivatives of pystencils fields.
+
+%% Cell type:code id: tags:
+
+``` python
+f = ps.fields("f: [2D]")
+first_derivative_x = ps.fd.diff(f, 0)
+first_derivative_x
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {f}_{(0,0)}}$
+    D(f[0,0])
+
+%% Cell type:markdown id: tags:
+
+This object is the derivative of the field $f$ with respect to the first spatial coordinate $x$. To get a finite difference approximation a discretization strategy is required:
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order = ps.fd.Discretization2ndOrder(dx=sp.symbols("h"))
+discretize_2nd_order(first_derivative_x)
+```
+
+%% Output
+
+
+    $\displaystyle \frac{{f}_{(1,0)} - {f}_{(-1,0)}}{2 h}$
+    f_E - f_W
+    ─────────
+       2⋅h
+
+%% Cell type:markdown id: tags:
+
+Strictly speaking, derivative objects act on *field accesses*, not *fields*, that why there is a $(0,0)$ index at the field:
+
+%% Cell type:code id: tags:
+
+``` python
+first_derivative_x
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {f}_{(0,0)}}$
+    D(f[0,0])
+
+%% Cell type:markdown id: tags:
+
+Sometimes it might be useful to specify derivatives at an offset e.g.
+
+%% Cell type:code id: tags:
+
+``` python
+derivative_offset = ps.fd.diff(f[0, 1], 0)
+derivative_offset, discretize_2nd_order(derivative_offset)
+```
+
+%% Output
+
+
+    $\displaystyle \left( {\partial_{0} {f}_{(0,1)}}, \  \frac{{f}_{(1,1)} - {f}_{(-1,1)}}{2 h}\right)$
+    ⎛           f_NE - f_NW⎞
+    ⎜D(f[0,1]), ───────────⎟
+    ⎝               2⋅h    ⎠
+
+%% Cell type:markdown id: tags:
+
+Another example with second order derivatives:
+
+%% Cell type:code id: tags:
+
+``` python
+laplacian = ps.fd.diff(f, 0, 0) + ps.fd.diff(f, 1, 1)
+laplacian
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{1} {\partial_{1} {f}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(f[0,0]))
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order(laplacian)
+```
+
+%% Output
+
+
+    $\displaystyle \frac{- 2 {f}_{(0,0)} + {f}_{(1,0)} + {f}_{(-1,0)}}{h^{2}} + \frac{- 2 {f}_{(0,0)} + {f}_{(0,1)} + {f}_{(0,-1)}}{h^{2}}$
+    -2⋅f_C + f_E + f_W   -2⋅f_C + f_N + f_S
+    ────────────────── + ──────────────────
+             2                    2
+            h                    h
+
+%% Cell type:markdown id: tags:
+
+## Working with derivative terms
+
+No automatic simplifications are done on derivative terms i.e. linearity relations or product rule are not applied automatically.
+
+%% Cell type:code id: tags:
+
+``` python
+f, g = ps.fields("f, g :[2D]")
+c = sp.symbols("c")
+δ = ps.fd.diff
+
+expr = δ( δ(f, 0) +  δ(g, 0) + c + 5 , 0)
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} (c + {\partial_{0} {f}_{(0,0)}} + {\partial_{0} {g}_{(0,0)}} + 5) }$
+    D(c + Diff(f_C, 0, -1) + Diff(g_C, 0, -1) + 5)
+
+%% Cell type:markdown id: tags:
+
+This nested term can not be discretized automatically.
+
+%% Cell type:code id: tags:
+
+``` python
+try:
+    discretize_2nd_order(expr)
+except ValueError as e:
+    print(e)
+```
+
+%% Output
+
+    Only derivatives with field or field accesses as arguments can be discretized
+
+%% Cell type:markdown id: tags:
+
+### Linearity
+The following function expands all derivatives exploiting linearity:
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr)
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} c} + {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(c) + D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:markdown id: tags:
+
+The symbol $c$ that was included is interpreted as a function by default.
+We can control the simplification behaviour by specifying all functions or all constants:
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr, functions=(f[0,0], g[0, 0]))
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.expand_diff_linear(expr, constants=[c])
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} {\partial_{0} {f}_{(0,0)}}} + {\partial_{0} {\partial_{0} {g}_{(0,0)}}}$
+    D(D(f[0,0])) + D(D(g[0,0]))
+
+%% Cell type:markdown id: tags:
+
+The expanded term can then be discretized:
+
+%% Cell type:code id: tags:
+
+``` python
+discretize_2nd_order(ps.fd.expand_diff_linear(expr, constants=[c]))
+```
+
+%% Output
+
+
+    $\displaystyle \frac{- 2 {f}_{(0,0)} + {f}_{(1,0)} + {f}_{(-1,0)}}{h^{2}} + \frac{- 2 {g}_{(0,0)} + {g}_{(1,0)} + {g}_{(-1,0)}}{h^{2}}$
+    -2⋅f_C + f_E + f_W   -2⋅g_C + g_E + g_W
+    ────────────────── + ──────────────────
+             2                    2
+            h                    h
+
+%% Cell type:markdown id: tags:
+
+### Product rule
+
+The next cells show how to apply product rule and its reverse:
+
+%% Cell type:code id: tags:
+
+``` python
+expr = δ(f[0, 0] * g[0, 0], 0 )
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} ({f}_{(0,0)} {g}_{(0,0)}) }$
+    D(f_C*g_C)
+
+%% Cell type:code id: tags:
+
+``` python
+expanded_expr = ps.fd.expand_diff_products(expr)
+expanded_expr
+```
+
+%% Output
+
+
+    $\displaystyle {f}_{(0,0)} {\partial_{0} {g}_{(0,0)}} + {g}_{(0,0)} {\partial_{0} {f}_{(0,0)}}$
+    f_C⋅D(g[0,0]) + g_C⋅D(f[0,0])
+
+%% Cell type:code id: tags:
+
+``` python
+recombined_expr = ps.fd.combine_diff_products(expanded_expr)
+recombined_expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} ({f}_{(0,0)} {g}_{(0,0)}) }$
+    D(f_C*g_C)
+
+%% Cell type:code id: tags:
+
+``` python
+assert recombined_expr == expr
+```
+
+%% Cell type:markdown id: tags:
+
+### Evaluate derivatives
+
+Arguments of derivative need not be to be fields, only when trying to discretize them.
+The next cells show how to transform them to *sympy* derivatives and evaluate them.
+
+%% Cell type:code id: tags:
+
+``` python
+k = sp.symbols("k")
+expr = δ(k**3 + 2 * k, 0 )
+expr
+```
+
+%% Output
+
+
+    $\displaystyle {\partial_{0} (k^{3} + 2 k) }$
+    D(k**3 + 2*k)
+
+%% Cell type:code id: tags:
+
+``` python
+ps.fd.evaluate_diffs(expr, var=k)
+```
+
+%% Output
+
+
+    $\displaystyle 3 k^{2} + 2$
+       2
+    3⋅k  + 2
--- a/doc/notebooks/demo_plotting_and_animation.ipynb
+++ b/doc/notebooks/demo_plotting_and_animation.ipynb
--- a/doc/notebooks/demo_wave_equation.ipynb
+++ b/doc/notebooks/demo_wave_equation.ipynb
+%% Cell type:code id: tags:
+
+``` python
+import psutil
+
+from pystencils.session import *
+
+import shutil
+```
+
+%% Cell type:markdown id: tags:
+
+# Demo: Finite differences - 2D wave equation
+
+In this tutorial we show how to use the finite difference module of *pystencils* to solve a 2D wave equations. The time derivative is discretized by a simple forward Euler method.
+
+%% Cell type:markdown id: tags:
+
+$$ \frac{\partial^2 u}{\partial t^2} = \mbox{div} \left( q(x,y) \nabla u \right)$$
+
+%% Cell type:markdown id: tags:
+
+We begin by creating three *numpy* arrays for the current, the previous and the next timestep. Actually we will see later that two fields are enough, but let's keep it simple. From these *numpy* arrays we create *pystencils* fields to formulate our update rule.
+
+%% Cell type:code id: tags:
+
+``` python
+size = (60, 70) #  domain size
+u_arrays = [np.zeros(size), np.zeros(size), np.zeros(size)]
+
+u_fields = [ps.Field.create_from_numpy_array("u%s" % (name,), arr)
+            for name, arr in zip(["0", "1", "2"], u_arrays)]
+
+# Nicer display for fields
+for i, field in enumerate(u_fields):
+    field.latex_name = "u^{(%d)}" % (i,)
+```
+
+%% Cell type:markdown id: tags:
+
+*pystencils* contains already simple rules to discretize the a diffusion term. The time discretization is done manually.
+
+%% Cell type:code id: tags:
+
+``` python
+discretize = ps.fd.Discretization2ndOrder()
+
+def central2nd_time_derivative(fields):
+    f_next, f_current, f_last = fields
+    return (f_next[0, 0] - 2 * f_current[0, 0] + f_last[0, 0]) / discretize.dt**2
+
+rhs = ps.fd.diffusion(u_fields[1], 1)
+
+wave_eq = sp.Eq(central2nd_time_derivative(u_fields), discretize(rhs))
+
+wave_eq = sp.simplify(wave_eq)
+wave_eq
+```
+
+%% Output
+
+
+    $\displaystyle \frac{{u^{(0)}}_{(0,0)} - 2 {u^{(1)}}_{(0,0)} + {u^{(2)}}_{(0,0)}}{dt^{2}} = \frac{- 4 {u^{(1)}}_{(0,0)} + {u^{(1)}}_{(1,0)} + {u^{(1)}}_{(0,1)} + {u^{(1)}}_{(0,-1)} + {u^{(1)}}_{(-1,0)}}{dx^{2}}$
+    u_0_C - 2⋅u_1_C + u_2_C   -4⋅u_1_C + u_1_E + u_1_N + u_1_S + u_1_W
+    ─────────────────────── = ────────────────────────────────────────
+                2                                 2
+              dt                                dx
+
+%% Cell type:markdown id: tags:
+
+The explicit Euler scheme is now obtained by solving above equation with respect to $u_C^{next}$.
+
+%% Cell type:code id: tags:
+
+``` python
+u_next_C = u_fields[-1][0, 0]
+update_rule = ps.Assignment(u_next_C, sp.solve(wave_eq, u_next_C)[0])
+update_rule
+```
+
+%% Output
+
+
+    $\displaystyle {u^{(2)}}_{(0,0)} \leftarrow \frac{- 4 {u^{(1)}}_{(0,0)} dt^{2} + {u^{(1)}}_{(1,0)} dt^{2} + {u^{(1)}}_{(0,1)} dt^{2} + {u^{(1)}}_{(0,-1)} dt^{2} + {u^{(1)}}_{(-1,0)} dt^{2} + dx^{2} \left(- {u^{(0)}}_{(0,0)} + 2 {u^{(1)}}_{(0,0)}\right)}{dx^{2}}$
+                         2           2           2           2           2     2
+             - 4⋅u_1_C⋅dt  + u_1_E⋅dt  + u_1_N⋅dt  + u_1_S⋅dt  + u_1_W⋅dt  + dx ⋅(-u_0_C + 2⋅u_1_C)
+    u_2_C := ──────────────────────────────────────────────────────────────────────────────────────
+                                                        2
+                                                      dx
+
+%% Cell type:markdown id: tags:
+
+Before creating the kernel, we substitute numeric values for $dx$ and $dt$.
+Then a kernel is created just like in the last tutorial.
+
+%% Cell type:code id: tags:
+
+``` python
+update_rule = update_rule.subs({discretize.dx: 0.1, discretize.dt: 0.05})
+ast = ps.create_kernel(update_rule)
+kernel = ast.compile()
+
+ps.show_code(ast)
+```
+
+%% Output
+
+
+
+%% Cell type:code id: tags:
+
+``` python
+ast.get_parameters()
+```
+
+%% Output
+
+    [_data_u0, _data_u1, _data_u2]
+
+%% Cell type:code id: tags:
+
+``` python
+ast.fields_accessed
+```
+
+%% Output
+
+    {u0: double[60,70], u1: double[60,70], u2: double[60,70]}
+
+%% Cell type:markdown id: tags:
+
+To run simulation a suitable initial condition and boundary treatment is required. We chose an initial condition which is zero at the borders of the domain. The outermost layer is not changed by the update kernel, so we have an implicit homogenous Dirichlet boundary condition.
+
+%% Cell type:code id: tags:
+
+``` python
+X,Y = np.meshgrid( np.linspace(0, 1, size[1]), np.linspace(0,1, size[0]))
+Z = np.sin(2*X*np.pi) * np.sin(2*Y*np.pi)
+
+# Initialize the previous and current values with the initial function
+np.copyto(u_arrays[0], Z)
+np.copyto(u_arrays[1], Z)
+# The values for the next timesteps do not matter, since they are overwritten
+u_arrays[2][:, :] = 0
+```
+
+%% Cell type:markdown id: tags:
+
+One timestep now consists of applying the kernel once, then shifting the arrays.
+
+%% Cell type:code id: tags:
+
+``` python
+def run(timesteps=1):
+    for t in range(timesteps):
+        kernel(u0=u_arrays[0], u1=u_arrays[1], u2=u_arrays[2])
+        u_arrays[0], u_arrays[1], u_arrays[2] = u_arrays[1], u_arrays[2], u_arrays[0]
+    return u_arrays[2]
+```
+
+%% Cell type:markdown id: tags:
+
+Lets create an animation of the solution:
+
+%% Cell type:code id: tags:
+
+``` python
+if shutil.which("ffmpeg") is not None:
+    ani = plt.surface_plot_animation(run, zlim=(-1, 1))
+    ps.jupyter.display_as_html_video(ani)
+else:
+    print("No ffmpeg installed")
+```
+
+%% Output
+
+    No ffmpeg installed
+
+%% Cell type:code id: tags:
+
+``` python
+assert np.isfinite(np.max(u_arrays[2]))
+```
+
+%% Cell type:markdown id: tags:
+
+__Runing on GPU__
+
+We can also run the same kernel on the GPU, by using the ``cupy`` package.
+
+%% Cell type:code id: tags:
+
+``` python
+try:
+    import cupy
+except ImportError:
+    cupy=None
+    print('No cupy installed')
+
+
+res = None
+if cupy:
+    gpu_ast = ps.create_kernel(update_rule, target=ps.Target.GPU)
+    gpu_kernel = gpu_ast.compile()
+    res = ps.show_code(gpu_ast)
+res
+```
+
+%% Output
+
+
+
+%% Cell type:markdown id: tags:
+
+The run function has to be changed now slightly, since the data has to be transfered to the GPU first, then the kernel can be executed, and in the end the data has to be transfered back
+
+%% Cell type:code id: tags:
+
+``` python
+if cupy:
+    def run_on_gpu(timesteps=1):
+        # Transfer arrays to GPU
+        gpuArrs = [cupy.asarray(cpu_array) for cpu_array in u_arrays]
+
+        for t in range(timesteps):
+            gpu_kernel(u0=gpuArrs[0], u1=gpuArrs[1], u2=gpuArrs[2])
+            gpuArrs[0], gpuArrs[1], gpuArrs[2] = gpuArrs[1], gpuArrs[2], gpuArrs[0]
+
+        # Transfer arrays to CPU
+        for gpuArr, cpuArr in zip(gpuArrs, u_arrays):
+            cpuArr[:] = gpuArr.get()
+assert np.isfinite(np.max(u_arrays[2]))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+if cupy:
+    run_on_gpu(400)
+```
+%% Cell type:code id: tags:
+
+``` python
+import psutil
+
+from pystencils.session import *
+
+import shutil
+```
+
+%% Cell type:markdown id: tags:
+
+# Demo: Finite differences - 2D wave equation
+
+In this tutorial we show how to use the finite difference module of *pystencils* to solve a 2D wave equations. The time derivative is discretized by a simple forward Euler method.
+
+%% Cell type:markdown id: tags:
+
+$$ \frac{\partial^2 u}{\partial t^2} = \mbox{div} \left( q(x,y) \nabla u \right)$$
+
+%% Cell type:markdown id: tags:
+
+We begin by creating three *numpy* arrays for the current, the previous and the next timestep. Actually we will see later that two fields are enough, but let's keep it simple. From these *numpy* arrays we create *pystencils* fields to formulate our update rule.
+
+%% Cell type:code id: tags:
+
+``` python
+size = (60, 70) #  domain size
+u_arrays = [np.zeros(size), np.zeros(size), np.zeros(size)]
+
+u_fields = [ps.Field.create_from_numpy_array("u%s" % (name,), arr)
+            for name, arr in zip(["0", "1", "2"], u_arrays)]
+
+# Nicer display for fields
+for i, field in enumerate(u_fields):
+    field.latex_name = "u^{(%d)}" % (i,)
+```
+
+%% Cell type:markdown id: tags:
+
+*pystencils* contains already simple rules to discretize the a diffusion term. The time discretization is done manually.
+
+%% Cell type:code id: tags:
+
+``` python
+discretize = ps.fd.Discretization2ndOrder()
+
+def central2nd_time_derivative(fields):
+    f_next, f_current, f_last = fields
+    return (f_next[0, 0] - 2 * f_current[0, 0] + f_last[0, 0]) / discretize.dt**2
+
+rhs = ps.fd.diffusion(u_fields[1], 1)
+
+wave_eq = sp.Eq(central2nd_time_derivative(u_fields), discretize(rhs))
+
+wave_eq = sp.simplify(wave_eq)
+wave_eq
+```
+
+%% Output
+
+
+    $\displaystyle \frac{{u^{(0)}}_{(0,0)} - 2 {u^{(1)}}_{(0,0)} + {u^{(2)}}_{(0,0)}}{dt^{2}} = \frac{- 4 {u^{(1)}}_{(0,0)} + {u^{(1)}}_{(1,0)} + {u^{(1)}}_{(0,1)} + {u^{(1)}}_{(0,-1)} + {u^{(1)}}_{(-1,0)}}{dx^{2}}$
+    u_0_C - 2⋅u_1_C + u_2_C   -4⋅u_1_C + u_1_E + u_1_N + u_1_S + u_1_W
+    ─────────────────────── = ────────────────────────────────────────
+                2                                 2
+              dt                                dx
+
+%% Cell type:markdown id: tags:
+
+The explicit Euler scheme is now obtained by solving above equation with respect to $u_C^{next}$.
+
+%% Cell type:code id: tags:
+
+``` python
+u_next_C = u_fields[-1][0, 0]
+update_rule = ps.Assignment(u_next_C, sp.solve(wave_eq, u_next_C)[0])
+update_rule
+```
+
+%% Output
+
+
+    $\displaystyle {u^{(2)}}_{(0,0)} \leftarrow \frac{- 4 {u^{(1)}}_{(0,0)} dt^{2} + {u^{(1)}}_{(1,0)} dt^{2} + {u^{(1)}}_{(0,1)} dt^{2} + {u^{(1)}}_{(0,-1)} dt^{2} + {u^{(1)}}_{(-1,0)} dt^{2} + dx^{2} \left(- {u^{(0)}}_{(0,0)} + 2 {u^{(1)}}_{(0,0)}\right)}{dx^{2}}$
+                         2           2           2           2           2     2
+             - 4⋅u_1_C⋅dt  + u_1_E⋅dt  + u_1_N⋅dt  + u_1_S⋅dt  + u_1_W⋅dt  + dx ⋅(-u_0_C + 2⋅u_1_C)
+    u_2_C := ──────────────────────────────────────────────────────────────────────────────────────
+                                                        2
+                                                      dx
+
+%% Cell type:markdown id: tags:
+
+Before creating the kernel, we substitute numeric values for $dx$ and $dt$.
+Then a kernel is created just like in the last tutorial.
+
+%% Cell type:code id: tags:
+
+``` python
+update_rule = update_rule.subs({discretize.dx: 0.1, discretize.dt: 0.05})
+ast = ps.create_kernel(update_rule)
+kernel = ast.compile()
+
+ps.show_code(ast)
+```
+
+%% Output
+
+
+
+%% Cell type:code id: tags:
+
+``` python
+ast.get_parameters()
+```
+
+%% Output
+
+    [_data_u0, _data_u1, _data_u2]
+
+%% Cell type:code id: tags:
+
+``` python
+ast.fields_accessed
+```
+
+%% Output
+
+    {u0: double[60,70], u1: double[60,70], u2: double[60,70]}
+
+%% Cell type:markdown id: tags:
+
+To run simulation a suitable initial condition and boundary treatment is required. We chose an initial condition which is zero at the borders of the domain. The outermost layer is not changed by the update kernel, so we have an implicit homogenous Dirichlet boundary condition.
+
+%% Cell type:code id: tags:
+
+``` python
+X,Y = np.meshgrid( np.linspace(0, 1, size[1]), np.linspace(0,1, size[0]))
+Z = np.sin(2*X*np.pi) * np.sin(2*Y*np.pi)
+
+# Initialize the previous and current values with the initial function
+np.copyto(u_arrays[0], Z)
+np.copyto(u_arrays[1], Z)
+# The values for the next timesteps do not matter, since they are overwritten
+u_arrays[2][:, :] = 0
+```
+
+%% Cell type:markdown id: tags:
+
+One timestep now consists of applying the kernel once, then shifting the arrays.
+
+%% Cell type:code id: tags:
+
+``` python
+def run(timesteps=1):
+    for t in range(timesteps):
+        kernel(u0=u_arrays[0], u1=u_arrays[1], u2=u_arrays[2])
+        u_arrays[0], u_arrays[1], u_arrays[2] = u_arrays[1], u_arrays[2], u_arrays[0]
+    return u_arrays[2]
+```
+
+%% Cell type:markdown id: tags:
+
+Lets create an animation of the solution:
+
+%% Cell type:code id: tags:
+
+``` python
+if shutil.which("ffmpeg") is not None:
+    ani = plt.surface_plot_animation(run, zlim=(-1, 1))
+    ps.jupyter.display_as_html_video(ani)
+else:
+    print("No ffmpeg installed")
+```
+
+%% Output
+
+    No ffmpeg installed
+
+%% Cell type:code id: tags:
+
+``` python
+assert np.isfinite(np.max(u_arrays[2]))
+```
+
+%% Cell type:markdown id: tags:
+
+__Runing on GPU__
+
+We can also run the same kernel on the GPU, by using the ``cupy`` package.
+
+%% Cell type:code id: tags:
+
+``` python
+try:
+    import cupy
+except ImportError:
+    cupy=None
+    print('No cupy installed')
+
+
+res = None
+if cupy:
+    gpu_ast = ps.create_kernel(update_rule, target=ps.Target.GPU)
+    gpu_kernel = gpu_ast.compile()
+    res = ps.show_code(gpu_ast)
+res
+```
+
+%% Output
+
+
+
+%% Cell type:markdown id: tags:
+
+The run function has to be changed now slightly, since the data has to be transfered to the GPU first, then the kernel can be executed, and in the end the data has to be transfered back
+
+%% Cell type:code id: tags:
+
+``` python
+if cupy:
+    def run_on_gpu(timesteps=1):
+        # Transfer arrays to GPU
+        gpuArrs = [cupy.asarray(cpu_array) for cpu_array in u_arrays]
+
+        for t in range(timesteps):
+            gpu_kernel(u0=gpuArrs[0], u1=gpuArrs[1], u2=gpuArrs[2])
+            gpuArrs[0], gpuArrs[1], gpuArrs[2] = gpuArrs[1], gpuArrs[2], gpuArrs[0]
+
+        # Transfer arrays to CPU
+        for gpuArr, cpuArr in zip(gpuArrs, u_arrays):
+            cpuArr[:] = gpuArr.get()
+assert np.isfinite(np.max(u_arrays[2]))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+if cupy:
+    run_on_gpu(400)
+```
--- a/doc/sphinx/api.rst
+++ b/doc/sphinx/api.rst
+API Reference
+=============
+
+.. toctree::
+   :maxdepth: 3
+
+   kernel_compile_and_call.rst
+   enums.rst
+   simplifications.rst
+   datahandling.rst
+   configuration.rst
+   field.rst
+   stencil.rst
+   finite_differences.rst
+   plot.rst
+   ast.rst
--- a/doc/sphinx/ast.rst
+++ b/doc/sphinx/ast.rst
+*********************************************
+For developers: AST Nodes and Transformations
+*********************************************
+
+
+AST Nodes
+=========
+
+.. automodule:: pystencils.astnodes
+   :members:
+
+
+
+Transformations
+===============
+
+
+.. automodule:: pystencils.transformations
+   :members:
--- a/doc/sphinx/configuration.rst
+++ b/doc/sphinx/configuration.rst
+*************
+Configuration
+*************
+
+.. automodule:: pystencils.cpu.cpujit
\ No newline at end of file
--- a/doc/sphinx/datahandling.rst
+++ b/doc/sphinx/datahandling.rst
+************
+DataHandling
+************
+
+.. autoclass:: pystencils.datahandling.DataHandling
+   :members:
\ No newline at end of file
--- a/doc/sphinx/enums.rst
+++ b/doc/sphinx/enums.rst
+************
+Enumerations
+************
+
+.. automodule:: pystencils.enums
+   :members:
--- a/doc/sphinx/field.rst
+++ b/doc/sphinx/field.rst
+*****
+Field
+*****
+
+.. automodule:: pystencils.field
+   :members:
\ No newline at end of file
--- a/doc/sphinx/finite_differences.rst
+++ b/doc/sphinx/finite_differences.rst
+******************
+Finite Differences
+******************
+
+.. automodule:: pystencils.fd
+   :members:
+
--- a/doc/sphinx/kernel_compile_and_call.rst
+++ b/doc/sphinx/kernel_compile_and_call.rst
+*****************************************
+Creating and calling kernels from Python
+*****************************************
+
+
+Creating kernels
+----------------
+
+.. autofunction:: pystencils.create_kernel
+
+.. autoclass:: pystencils.CreateKernelConfig
+    :members:
+
+.. autofunction:: pystencils.kernelcreation.create_domain_kernel
+
+.. autofunction:: pystencils.kernelcreation.create_indexed_kernel
+
+.. autofunction:: pystencils.kernelcreation.create_staggered_kernel
+
+
+Code printing
+-------------
+
+.. autofunction:: pystencils.show_code
+
+
+GPU Indexing
+-------------
+
+.. autoclass:: pystencils.gpu.AbstractIndexing
+   :members:
+
+.. autoclass:: pystencils.gpu.BlockIndexing
+   :members:
+
+.. autoclass:: pystencils.gpu.LineIndexing
+   :members:
--- a/doc/sphinx/plot.rst
+++ b/doc/sphinx/plot.rst
+**********************
+Plotting and Animation
+**********************
+
+.. automodule:: pystencils.plot
+   :members:
+
+
--- a/doc/sphinx/pystencils.bib
+++ b/doc/sphinx/pystencils.bib
--- a/doc/sphinx/simplifications.rst
+++ b/doc/sphinx/simplifications.rst
+***************************************
+Assignment Collection & Simplifications
+***************************************
+
+
+AssignmentCollection
+====================
+
+.. autoclass:: pystencils.AssignmentCollection
+   :members:
+
+
+SimplificationStrategy
+======================
+
+.. autoclass:: pystencils.simp.SimplificationStrategy
+    :members:
+
+Simplifications
+===============
+
+.. automodule:: pystencils.simp.simplifications
+    :members:
+
+Subexpression insertion
+=======================
+
+The subexpression insertions have the goal to insert subexpressions which will not reduce the number of FLOPs.
+For example a constant value kept as subexpression will lead to a new variable in the code which will occupy
+a register slot. On the other side a single variable could just be inserted in all assignments.
+
+.. automodule:: pystencils.simp.subexpression_insertion
+    :members:
+
+
+
+
+
--- a/doc/sphinx/stencil.rst
+++ b/doc/sphinx/stencil.rst
+*******
+Stencil
+*******
+
+.. automodule:: pystencils.stencil
+   :members:
--- a/doc/sphinx/tutorials.rst
+++ b/doc/sphinx/tutorials.rst
+Tutorials
+=========
+
+These tutorials are a good place to start if you are new to pystencils.
+All tutorials and demos listed here are based on Jupyter notebooks that you can find in the pystencils repository.
+It is a good idea to download them and run them directly to be able to play around with the code.
+
+.. toctree::
+   :maxdepth: 1
+
+   /notebooks/01_tutorial_getting_started.ipynb
+   /notebooks/02_tutorial_basic_kernels.ipynb
+   /notebooks/03_tutorial_datahandling.ipynb
+   /notebooks/04_tutorial_advection_diffusion.ipynb
+   /notebooks/05_tutorial_phasefield_spinodal_decomposition.ipynb
+   /notebooks/06_tutorial_phasefield_dentritic_growth.ipynb
+   /notebooks/demo_assignment_collection.ipynb
+   /notebooks/demo_plotting_and_animation.ipynb
+   /notebooks/demo_derivatives.ipynb
+   /notebooks/demo_benchmark.ipynb
+   /notebooks/demo_wave_equation.ipynb
--- a/gpucuda/indexing.py
+++ b/gpucuda/indexing.py
-import abc
-from typing import Tuple  # noqa
-import sympy as sp
-from pystencils.astnodes import Conditional, Block
-from pystencils.integer_functions import div_ceil
-from pystencils.slicing import normalize_slice
-from pystencils.data_types import TypedSymbol, create_type
-from functools import partial
-
-AUTO_BLOCK_SIZE_LIMITING = False
-
-BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
-
-
-class AbstractIndexing(abc.ABC):
-    """
-    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
-    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
-    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
-    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
-    """
-
-    @property
-    @abc.abstractmethod
-    def coordinates(self):
-        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
-        These symbolic indices can be obtained with the method `index_variables` """
-
-    @property
-    def index_variables(self):
-        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
-        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
-
-    @abc.abstractmethod
-    def call_parameters(self, arr_shape):
-        """Determine grid and block size for kernel call.
-
-        Args:
-            arr_shape: the numeric (not symbolic) shape of the array
-        Returns:
-            dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
-            the kernel should be started with
-        """
-
-    @abc.abstractmethod
-    def guard(self, kernel_content, arr_shape):
-        """In some indexing schemes not all threads of a block execute the kernel content.
-
-        This function can return a Conditional ast node, defining this execution guard.
-
-        Args:
-            kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
-            arr_shape: the numeric or symbolic shape of the field
-
-        Returns:
-            ast node, which is put inside the kernel function
-        """
-
-
-# -------------------------------------------- Implementations ---------------------------------------------------------
-
-
-class BlockIndexing(AbstractIndexing):
-    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
-
-    Args:
-        field: pystencils field (common to all Indexing classes)
-        iteration_slice: slice that defines rectangular subarea which is iterated over
-        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
-                                                gets the largest amount of threads
-        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
-    """
-    def __init__(self, field, iteration_slice=None,
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
-        if field.spatial_dimensions > 3:
-            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
-
-        if permute_block_size_dependent_on_layout:
-            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
-
-        if AUTO_BLOCK_SIZE_LIMITING:
-            block_size = self.limit_block_size_to_device_maximum(block_size)
-
-        self._block_size = block_size
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._dim = field.spatial_dimensions
-        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
-        self._compile_time_block_size = compile_time_block_size
-
-    @property
-    def coordinates(self):
-        offsets = _get_start_from_slice(self._iterationSlice)
-        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
-        coordinates = [block_index * bs + thread_idx + off
-                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
-
-        return coordinates[:self._dim]
-
-    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
-
-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
-        extend_bs = (1,) * (3 - len(self._block_size))
-        block_size = self._block_size + extend_bs
-        if not self._compile_time_block_size:
-            block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
-
-        grid = tuple(div_ceil(length, block_size)
-                     for length, block_size in zip(widths, block_size))
-        extend_gr = (1,) * (3 - len(grid))
-
-        return {'block': block_size,
-                'grid': grid + extend_gr}
-
-    def guard(self, kernel_content, arr_shape):
-        arr_shape = arr_shape[:self._dim]
-        conditions = [c < end
-                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
-        condition = conditions[0]
-        for c in conditions[1:]:
-            condition = sp.And(condition, c)
-        return Block([Conditional(condition, kernel_content)])
-
-    @staticmethod
-    def limit_block_size_to_device_maximum(block_size):
-        """Changes block size according to match device limits.
-
-        * if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
-        * next, if one component is still too big, the component which is too big is divided by 2 and the smallest
-          component is multiplied by 2, such that the total amount of threads stays the same
-
-        Returns:
-            the altered block_size
-        """
-        # Get device limits
-        import pycuda.driver as cuda
-        # noinspection PyUnresolvedReferences
-        import pycuda.autoinit  # NOQA
-
-        da = cuda.device_attribute
-        device = cuda.Context.get_device()
-
-        block_size = list(block_size)
-        max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
-        max_block_size = [device.get_attribute(a)
-                          for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
-
-        def prod(seq):
-            result = 1
-            for e in seq:
-                result *= e
-            return result
-
-        def get_index_of_too_big_element():
-            for i, bs in enumerate(block_size):
-                if bs > max_block_size[i]:
-                    return i
-            return None
-
-        def get_index_of_too_small_element():
-            for i, bs in enumerate(block_size):
-                if bs // 2 <= max_block_size[i]:
-                    return i
-            return None
-
-        # Reduce the total number of threads if necessary
-        while prod(block_size) > max_threads:
-            item_to_reduce = block_size.index(max(block_size))
-            for j, block_size_entry in enumerate(block_size):
-                if block_size_entry > max_block_size[j]:
-                    item_to_reduce = j
-            block_size[item_to_reduce] //= 2
-
-        # Cap individual elements
-        too_big_element_index = get_index_of_too_big_element()
-        while too_big_element_index is not None:
-            too_small_element_index = get_index_of_too_small_element()
-            block_size[too_small_element_index] *= 2
-            block_size[too_big_element_index] //= 2
-            too_big_element_index = get_index_of_too_big_element()
-
-        return tuple(block_size)
-
-    @staticmethod
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
-        This is not done automatically, since the required_registers_per_thread are not known before compilation.
-        They can be obtained by ``func.num_regs`` from a pycuda function.
-        :returns smaller block_size if too many registers are used.
-        """
-        import pycuda.driver as cuda
-        # noinspection PyUnresolvedReferences
-        import pycuda.autoinit  # NOQA
-
-        da = cuda.device_attribute
-        if device is None:
-            device = cuda.Context.get_device()
-        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
-
-        block = block_size
-
-        while True:
-            num_threads = 1
-            for t in block:
-                num_threads *= t
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
-            else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
-                assert block[largest_grid_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
-
-    @staticmethod
-    def permute_block_size_according_to_layout(block_size, layout):
-        """Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
-        sorted_block_size = list(sorted(block_size, reverse=True))
-        while len(sorted_block_size) > len(layout):
-            sorted_block_size[0] *= sorted_block_size[-1]
-            sorted_block_size = sorted_block_size[:-1]
-
-        result = list(block_size)
-        for l, bs in zip(reversed(layout), sorted_block_size):
-            result[l] = bs
-        return tuple(result[:len(layout)])
-
-
-class LineIndexing(AbstractIndexing):
-    """
-    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
-    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
-    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
-    maximum amount of threads allowed in a CUDA block (which depends on device).
-    """
-
-    def __init__(self, field, iteration_slice=None):
-        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
-        if field.spatial_dimensions > 4:
-            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
-
-        coordinates = available_indices[:field.spatial_dimensions]
-
-        fastest_coordinate = field.layout[-1]
-        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
-
-        self._coordinates = coordinates
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
-
-    @property
-    def coordinates(self):
-        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
-
-    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
-
-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
-
-        def get_shape_of_cuda_idx(cuda_idx):
-            if cuda_idx not in self._coordinates:
-                return 1
-            else:
-                idx = self._coordinates.index(cuda_idx)
-                return widths[idx]
-
-        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
-                'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
-
-    def guard(self, kernel_content, arr_shape):
-        return kernel_content
-
-
-# -------------------------------------- Helper functions --------------------------------------------------------------
-
-def _get_start_from_slice(iteration_slice):
-    res = []
-    for slice_component in iteration_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.start if slice_component.start is not None else 0)
-        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component)
-    return res
-
-
-def _get_end_from_slice(iteration_slice, arr_shape):
-    iter_slice = normalize_slice(iteration_slice, arr_shape)
-    res = []
-    for slice_component in iter_slice:
-        if type(slice_component) is slice:
-            res.append(slice_component.stop)
-        else:
-            assert isinstance(slice_component, int)
-            res.append(slice_component + 1)
-    return res
-
-
-def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
-    if isinstance(gpu_indexing, str):
-        if gpu_indexing == 'block':
-            indexing_creator = BlockIndexing
-        elif gpu_indexing == 'line':
-            indexing_creator = LineIndexing
-        else:
-            raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
-        if gpu_indexing_params:
-            indexing_creator = partial(indexing_creator, **gpu_indexing_params)
-        return indexing_creator
-    else:
-        return gpu_indexing
--- a/kerncraft_coupling/__init__.py
+++ b/kerncraft_coupling/__init__.py
-from .kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
-
-__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters']
--- a/kerncraft_coupling/generate_benchmark.py
+++ b/kerncraft_coupling/generate_benchmark.py
-from jinja2 import Template
-from pystencils.backends.cbackend import generate_c
-from pystencils.sympyextensions import prod
-from pystencils.data_types import get_base_type
-
-benchmark_template = Template("""
-#include "kerncraft.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <math.h>
-{%- if likwid %}
-#include <likwid.h>
-{%- endif %}
-
-#define RESTRICT __restrict__
-#define FUNC_PREFIX
-void dummy(double *);
-extern int var_false;
-
-
-{{kernel_code}}
-
-
-int main(int argc, char **argv)
-{
-  {%- if likwid %}
-  likwid_markerInit();
-  likwid_markerThreadInit();
-  {%- endif %}
-
-  {%- for field_name, dataType, size in fields %}
-
-  // Initialization {{field_name}}
-  double * {{field_name}} = (double *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 32);
-  for (unsigned long long i = 0; i < {{size}}; ++i)
-    {{field_name}}[i] = 0.23;
-
-  if(var_false)
-    dummy({{field_name}});
-
-  {%- endfor %}
-
-
-
-  {%- for constantName, dataType in constants %}
-
-  // Constant {{constantName}}
-  {{dataType}} {{constantName}};
-  {{constantName}} = 0.23;
-  if(var_false)
-      dummy(& {{constantName}});
-
-  {%- endfor %}
-
-  int repeat = atoi(argv[1]);
-  {%- if likwid %}
-  likwid_markerStartRegion("loop");
-  {%- endif %}
-
-  for (; repeat > 0; --repeat)
-  {
-    {{kernelName}}({{call_argument_list}});
-
-    // Dummy calls
-    {%- for field_name, dataType, size in fields %}
-    if(var_false) dummy({{field_name}});
-    {%- endfor %}
-    {%- for constantName, dataType in constants %}
-    if(var_false) dummy(&{{constantName}});
-    {%- endfor %}
-  }
-
-  {%- if likwid %}
-  likwid_markerStopRegion("loop");
-  {%- endif %}
-
-  {%- if likwid %}
-  likwid_markerClose();
-  {%- endif %}
-}
-""")
-
-
-def generate_benchmark(ast, likwid=False):
-    accessed_fields = {f.name: f for f in ast.fields_accessed}
-    constants = []
-    fields = []
-    call_parameters = []
-    for p in ast.get_parameters():
-        if not p.is_field_parameter:
-            constants.append((p.symbol.name, str(p.symbol.dtype)))
-            call_parameters.append(p.symbol.name)
-        else:
-            assert p.is_field_pointer, "Benchmark implemented only for kernels with fixed loop size"
-            field = accessed_fields[p.field_name]
-            dtype = str(get_base_type(p.symbol.dtype))
-            fields.append((p.field_name, dtype, prod(field.shape)))
-            call_parameters.append(p.field_name)
-
-    args = {
-        'likwid': likwid,
-        'kernel_code': generate_c(ast),
-        'kernelName': ast.function_name,
-        'fields': fields,
-        'constants': constants,
-        'call_argument_list': ",".join(call_parameters),
-    }
-    return benchmark_template.render(**args)
--- a/kerncraft_coupling/kerncraft_interface.py
+++ b/kerncraft_coupling/kerncraft_interface.py
-from tempfile import TemporaryDirectory
-
-import sympy as sp
-import os
-from collections import defaultdict
-import subprocess
-import kerncraft
-import kerncraft.kernel
-from kerncraft.iaca import iaca_analyse_instrumented_binary, iaca_instrumentation
-from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
-from pystencils.astnodes import LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess
-from pystencils.field import get_layout_from_strides
-from pystencils.sympyextensions import count_operations_in_ast
-from pystencils.utils import DotDict
-
-
-class PyStencilsKerncraftKernel(kerncraft.kernel.Kernel):
-    """
-    Implementation of kerncraft's kernel interface for pystencils CPU kernels.
-    Analyses a list of equations assuming they will be executed on a CPU
-    """
-    LIKWID_BASE = '/usr/local/likwid'
-
-    def __init__(self, ast, machine=None):
-        super(PyStencilsKerncraftKernel, self).__init__(machine)
-
-        self.ast = ast
-        self.temporary_dir = TemporaryDirectory()
-
-        # Loops
-        inner_loops = [l for l in ast.atoms(LoopOverCoordinate) if l.is_innermost_loop]
-        if len(inner_loops) == 0:
-            raise ValueError("No loop found in pystencils AST")
-        elif len(inner_loops) > 1:
-            raise ValueError("pystencils AST contains multiple inner loops - only one can be analyzed")
-        else:
-            inner_loop = inner_loops[0]
-
-        self._loop_stack = []
-        cur_node = inner_loop
-        while cur_node is not None:
-            if isinstance(cur_node, LoopOverCoordinate):
-                loop_counter_sym = cur_node.loop_counter_symbol
-                loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, cur_node.step)
-                self._loop_stack.append(loop_info)
-            cur_node = cur_node.parent
-        self._loop_stack = list(reversed(self._loop_stack))
-
-        # Data sources & destinations
-        self.sources = defaultdict(list)
-        self.destinations = defaultdict(list)
-
-        reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
-        for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
-            for fa in accesses:
-                coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
-                         for i, off in enumerate(fa.offsets)]
-                coord += list(fa.idx_coordinate_values)
-                layout = get_layout_from_strides(fa.field.strides)
-                permuted_coord = [coord[i] for i in layout]
-                target_dict[fa.field.name].append(permuted_coord)
-
-        # Variables (arrays)
-        fields_accessed = ast.fields_accessed
-        for field in fields_accessed:
-            layout = get_layout_from_strides(field.strides)
-            permuted_shape = list(field.shape[i] for i in layout)
-            self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
-
-        for param in ast.get_parameters():
-            if not param.is_field_parameter:
-                self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
-                self.sources[param.symbol.name] = [None]
-
-        # data type
-        self.datatype = list(self.variables.values())[0][0]
-
-        # flops
-        operation_count = count_operations_in_ast(inner_loop)
-        self._flops = {
-            '+': operation_count['adds'],
-            '*': operation_count['muls'],
-            '/': operation_count['divs'],
-        }
-        for k in [k for k, v in self._flops.items() if v == 0]:
-            del self._flops[k]
-        self.check()
-
-    def iaca_analysis(self, micro_architecture, asm_block='auto',
-                      pointer_increment='auto_with_manual_fallback', verbose=False):
-        compiler, compiler_args = self._machine.get_compiler()
-        if '-std=c99' not in compiler_args:
-            compiler_args += ['-std=c99']
-        header_path = kerncraft.get_header_path()
-
-        compiler_cmd = [compiler] + compiler_args + ['-I' + header_path]
-
-        src_file = os.path.join(self.temporary_dir.name, "source.c")
-        asm_file = os.path.join(self.temporary_dir.name, "source.s")
-        iaca_asm_file = os.path.join(self.temporary_dir.name, "source.iaca.s")
-        dummy_src_file = os.path.join(header_path, "dummy.c")
-        dummy_asm_file = os.path.join(self.temporary_dir.name, "dummy.s")
-        binary_file = os.path.join(self.temporary_dir.name, "binary")
-
-        # write source code to file
-        with open(src_file, 'w') as f:
-            f.write(generate_benchmark(self.ast, likwid=False))
-
-        # compile to asm files
-        subprocess.check_output(compiler_cmd + [src_file, '-S', '-o', asm_file])
-        subprocess.check_output(compiler_cmd + [dummy_src_file, '-S', '-o', dummy_asm_file])
-
-        with open(asm_file) as read, open(iaca_asm_file, 'w') as write:
-            instrumented_asm_block = iaca_instrumentation(read, write)
-
-        # assemble asm files to executable
-        subprocess.check_output(compiler_cmd + [iaca_asm_file, dummy_asm_file, '-o', binary_file])
-
-        result = iaca_analyse_instrumented_binary(binary_file, micro_architecture)
-    
-        return result, instrumented_asm_block
-
-    def build(self, lflags=None, verbose=False):
-        compiler, compiler_args = self._machine.get_compiler()
-        if '-std=c99' not in compiler_args:
-            compiler_args.append('-std=c99')
-        header_path = kerncraft.get_header_path()
-
-        cmd = [compiler] + compiler_args + [
-            '-I' + os.path.join(self.LIKWID_BASE, 'include'),
-            '-L' + os.path.join(self.LIKWID_BASE, 'lib'),
-            '-I' + header_path,
-            '-Wl,-rpath=' + os.path.join(self.LIKWID_BASE, 'lib'),
-        ]
-
-        dummy_src_file = os.path.join(header_path, 'dummy.c')
-        src_file = os.path.join(self.temporary_dir.name, "source_likwid.c")
-        bin_file = os.path.join(self.temporary_dir.name, "benchmark")
-
-        with open(src_file, 'w') as f:
-            f.write(generate_benchmark(self.ast, likwid=True))
-
-        subprocess.check_output(cmd + [src_file, dummy_src_file, '-pthread', '-llikwid', '-o', bin_file])
-        return bin_file
-
-
-class KerncraftParameters(DotDict):
-    def __init__(self, **kwargs):
-        super(KerncraftParameters, self).__init__(**kwargs)
-        self['asm_block'] = 'auto'
-        self['asm_increment'] = 0
-        self['cores'] = 1
-        self['cache_predictor'] = 'SIM'
-        self['verbose'] = 0
-        self['pointer_increment'] = 'auto'
-        self['iterations'] = 10
-
-
-# ------------------------------------------- Helper functions ---------------------------------------------------------
-
-
-def search_resolved_field_accesses_in_ast(ast):
-    def visit(node, reads, writes):
-        if not isinstance(node, SympyAssignment):
-            for a in node.args:
-                visit(a, reads, writes)
-            return
-
-        for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
-            accesses.update(expr.atoms(ResolvedFieldAccess))
-
-    read_accesses = set()
-    write_accesses = set()
-    visit(ast, read_accesses, write_accesses)
-    return read_accesses, write_accesses
No results found