Showing with 4992 additions and 0 deletions
from .derivative import (
Diff, DiffOperator, collect_diffs, combine_diff_products, diff, diff_terms, evaluate_diffs,
expand_diff_full, expand_diff_linear, expand_diff_products, functional_derivative,
normalize_diff_order, zero_diffs)
from .finitedifferences import Discretization2ndOrder, advection, diffusion, transient
from .finitevolumes import FVM1stOrder, VOF
from .spatial import discretize_spatial, discretize_spatial_staggered
__all__ = ['Diff', 'diff', 'DiffOperator', 'diff_terms', 'collect_diffs',
'zero_diffs', 'evaluate_diffs', 'normalize_diff_order', 'expand_diff_full', 'expand_diff_linear',
'expand_diff_products', 'combine_diff_products', 'functional_derivative',
'advection', 'diffusion', 'transient', 'Discretization2ndOrder', 'discretize_spatial',
'discretize_spatial_staggered', 'FVM1stOrder', 'VOF']
import itertools
from collections import defaultdict
import numpy as np
import sympy as sp
from pystencils.field import Field
from pystencils.stencil import direction_string_to_offset
from pystencils.sympyextensions import multidimensional_sum, prod
from pystencils.utils import LinearEquationSystem, fully_contains
class FiniteDifferenceStencilDerivation:
"""Derives finite difference stencils.
Can derive standard finite difference stencils, as well as isotropic versions
(see Isotropic Finite Differences by A. Kumar)
Args:
derivative_coordinates: tuple indicating which derivative should be approximated:
(1, ) stands for the first derivative in the second direction (y),
(0, 1) is a mixed second derivative in x and y,
(0, 0, 0) is a third derivative in the x direction
stencil: list of offset tuples, defining the stencil
dx: spacing between grid points, one for all directions, i.e. dx=dy=dz
Examples:
Central differences
>>> fd_1d = FiniteDifferenceStencilDerivation((0,), stencil=[(-1,), (0,), (1,)])
>>> result = fd_1d.get_stencil()
>>> result
Finite difference stencil of accuracy 2, isotropic error: False
>>> result.weights
[-1/2, 0, 1/2]
Forward differences
>>> fd_1d = FiniteDifferenceStencilDerivation((0,), stencil=[(0,), (1,)])
>>> result = fd_1d.get_stencil()
>>> result
Finite difference stencil of accuracy 1, isotropic error: False
>>> result.weights
[-1, 1]
"""
def __init__(self, derivative_coordinates, stencil, dx=1):
self.dim = len(stencil[0])
self.field = Field.create_generic('f', spatial_dimensions=self.dim)
self._derivative = tuple(sorted(derivative_coordinates))
self._stencil = stencil
self._dx = dx
self.weights = {tuple(d): self.symbolic_weight(*d) for d in self._stencil}
def assume_symmetric(self, dim, anti_symmetric=False):
"""Adds restriction that weight in opposite directions of a dimension are equal (symmetric) or
the negative of each other (anti symmetric)
For example: dim=1, assumes that w(1, 1) == w(1, -1), if anti_symmetric=False or
w(1, 1) == -w(1, -1) if anti_symmetric=True
"""
update = {}
for direction, value in self.weights.items():
inv_direction = tuple(-offset if i == dim else offset for i, offset in enumerate(direction))
if direction[dim] < 0:
inv_weight = self.weights[inv_direction]
update[direction] = -inv_weight if anti_symmetric else inv_weight
self.weights.update(update)
def set_weight(self, offset, value):
assert offset in self.weights
self.weights[offset] = value
def get_stencil(self, isotropic=False) -> 'FiniteDifferenceStencilDerivation.Result':
weights = [self.weights[d] for d in self._stencil]
system = LinearEquationSystem(sp.Matrix(weights).atoms(sp.Symbol))
order = 0
while True:
new_system = system.copy()
eq = self.error_term_equations(order)
new_system.add_equations(eq)
sol_structure = new_system.solution_structure()
if sol_structure == 'single':
system = new_system
elif sol_structure == 'multiple':
system = new_system
elif sol_structure == 'none':
break
else:
assert False
order += 1
accuracy = order - len(self._derivative)
error_is_isotropic = False
if isotropic:
new_system = system.copy()
new_system.add_equations(self.isotropy_equations(order))
sol_structure = new_system.solution_structure()
error_is_isotropic = sol_structure != 'none'
if error_is_isotropic:
system = new_system
solve_res = system.solution()
weight_list = [self.weights[d].subs(solve_res) for d in self._stencil]
return self.Result(self._stencil, weight_list, accuracy, error_is_isotropic)
@staticmethod
def symbolic_weight(*args):
str_args = [str(e) for e in args]
return sp.Symbol(f"w_({','.join(str_args)})")
def error_term_dict(self, order):
error_terms = defaultdict(lambda: 0)
for direction in self._stencil:
weight = self.weights[tuple(direction)]
x = tuple(self._dx * d_i for d_i in direction)
for offset in multidimensional_sum(order, dim=self.field.spatial_dimensions):
fac = sp.factorial(order)
error_terms[tuple(sorted(offset))] += weight / fac * prod(x[off] for off in offset)
if self._derivative in error_terms:
error_terms[self._derivative] -= 1
return error_terms
def error_term_equations(self, order):
return list(self.error_term_dict(order).values())
def isotropy_equations(self, order):
def cycle_int_sequence(sequence, modulus):
result = []
arr = np.array(sequence, dtype=int)
while True:
if tuple(arr) in result:
break
result.append(tuple(arr))
arr = (arr + 1) % modulus
return tuple(set(tuple(sorted(t)) for t in result))
error_dict = self.error_term_dict(order)
eqs = []
for derivative_tuple in list(error_dict.keys()):
if fully_contains(self._derivative, derivative_tuple):
remaining = list(derivative_tuple)
for e in self._derivative:
del remaining[remaining.index(e)]
permutations = cycle_int_sequence(remaining, self.dim)
if len(permutations) == 1:
eqs.append(error_dict[derivative_tuple])
else:
for i in range(1, len(permutations)):
new_eq = (error_dict[tuple(sorted(permutations[i] + self._derivative))]
- error_dict[tuple(sorted(permutations[i - 1] + self._derivative))])
if new_eq:
eqs.append(new_eq)
else:
eqs.append(error_dict[derivative_tuple])
return eqs
class Result:
def __init__(self, stencil, weights, accuracy, is_isotropic):
self.stencil = stencil
self.weights = weights
self.accuracy = accuracy
self.is_isotropic = is_isotropic
def visualize(self):
from pystencils.stencil import plot
plot(self.stencil, data=self.weights)
def apply(self, field_access: Field.Access):
f = field_access
return sum(f.get_shifted(*offset) * weight for offset, weight in zip(self.stencil, self.weights))
def __array__(self):
return np.array(self.as_array().tolist())
def as_array(self):
dim = len(self.stencil[0])
assert (dim == 2 or dim == 3), "Only 2D or 3D matrix representations are available"
max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
shape_list = []
for i in range(dim):
shape_list.append(2 * max_offset + 1)
number_of_elements = np.prod(shape_list)
shape = tuple(shape_list)
result = sp.MutableDenseNDimArray([0] * number_of_elements, shape)
if dim == 2:
for direction, weight in zip(self.stencil, self.weights):
result[max_offset - direction[1], max_offset + direction[0]] = weight
if dim == 3:
for direction, weight in zip(self.stencil, self.weights):
result[max_offset - direction[1], max_offset + direction[0], max_offset + direction[2]] = weight
return result
def rotate_weights_and_apply(self, field_access: Field.Access, axes):
"""derive gradient weights of other direction with already calculated weights of one direction
via rotation and apply them to a field."""
dim = len(self.stencil[0])
assert (dim == 2 or dim == 3), "This function is only for 2D or 3D stencils available"
rotated_weights = np.rot90(np.array(self.__array__()), 1, axes)
result = []
max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
if dim == 2:
for direction in self.stencil:
result.append(rotated_weights[max_offset - direction[1],
max_offset + direction[0]])
if dim == 3:
for direction in self.stencil:
result.append(rotated_weights[max_offset - direction[1],
max_offset + direction[0],
max_offset + direction[2]])
f = field_access
return sum(f.get_shifted(*offset) * weight for offset, weight in zip(self.stencil, result))
def __repr__(self):
return "Finite difference stencil of accuracy {}, isotropic error: {}".format(self.accuracy,
self.is_isotropic)
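# Illustrative usage sketch (an addition to this diff, not part of the original module):
# derive an isotropic second-derivative stencil on the full 3x3 neighborhood, using the
# same symmetry assumptions as the 4th-order derivation further below; the field name
# 'f' is an arbitrary choice.
def _demo_isotropic_second_derivative():
    stencil = [(i, j) for i in (-1, 0, 1) for j in (-1, 0, 1)]
    derivation = FiniteDifferenceStencilDerivation((0, 0), stencil)
    derivation.assume_symmetric(0)
    derivation.assume_symmetric(1)
    result = derivation.get_stencil(isotropic=True)
    f = Field.create_generic('f', spatial_dimensions=2)
    return result.apply(f.center)  # weighted sum of the nine neighbor accesses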
class FiniteDifferenceStaggeredStencilDerivation:
"""Derives a finite difference stencil for application at a staggered position
Args:
neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative
dim: how many dimensions (2 or 3)
derivative: a tuple of directions over which to perform derivatives
free_weights_prefix: a string to prefix to free weight symbols. If None, do not return free weights
"""
def __init__(self, neighbor, dim, derivative=tuple(), free_weights_prefix=None):
if type(neighbor) is str:
neighbor = direction_string_to_offset(neighbor)
if dim == 2:
assert neighbor[dim:] == 0
assert derivative == tuple() or max(derivative) < dim
neighbor = sp.Matrix(neighbor[:dim])
pos = neighbor / 2
def unitvec(i):
"""return the `i`-th unit vector in three dimensions"""
a = np.zeros(dim, dtype=int)
a[i] = 1
return a
def flipped(a, i):
"""return `a` with its `i`-th element's sign flipped"""
a = a.copy()
a[i] *= -1
return a
# determine the points to use, coordinates are relative to position
points = []
if np.linalg.norm(neighbor, 1) == 1:
main_points = [neighbor / 2, neighbor / -2]
elif np.linalg.norm(neighbor, 1) == 2:
nonzero_indices = [i for i, v in enumerate(neighbor) if v != 0 and i < dim]
main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]),
flipped(neighbor / -2, nonzero_indices[0])]
else:
main_points = [sp.Matrix(np.multiply(neighbor, sp.Matrix(c) / 2))
for c in itertools.product([-1, 1], repeat=3)]
points += main_points
zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim]
for i in zero_indices:
points += [point + sp.Matrix(unitvec(i)) for point in main_points]
points += [point - sp.Matrix(unitvec(i)) for point in main_points]
points_tuple = tuple([tuple(p) for p in points])
self._stencil = points_tuple
# determine the stencil weights
if len(derivative) == 0:
weights = None
else:
derivation = FiniteDifferenceStencilDerivation(derivative, points_tuple).get_stencil()
if not derivation.accuracy:
raise Exception('the requested derivative cannot be performed with the available neighbors')
weights = derivation.weights
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if free_weights_prefix is not None:
weights = [w.subs({fw: sp.Symbol(f"{free_weights_prefix}_{i}") for i, fw in enumerate(free_weights)})
for w in weights]
elif len(free_weights) > 0:
zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
weights = [w.subs(subs) for w in derivation.weights]
if not all(a == 0 for a in weights):
zero_count = sum([1 for w in weights if w == 0])
zero_counts[zero_count].append(weights)
best = zero_counts[max(zero_counts.keys())]
if len(best) > 1: # if there are multiple, pick the one that contains a nonzero center weight
center = [tuple(p + pos) for p in points].index((0, 0, 0)[:dim])
best = [b for b in best if b[center] != 0]
if len(best) > 1: # if there are still multiple, they are equivalent, so we average
weights = [sum([b[i] for b in best]) / len(best) for i in range(len(weights))]
else:
weights = best[0]
assert weights
points_tuple = tuple([tuple(p + pos) for p in points])
self._points = points_tuple
self._weights = weights
@property
def points(self):
"""return the points of the stencil"""
return self._points
@property
def stencil(self):
"""return the points of the stencil relative to the staggered position specified by neighbor"""
return self._stencil
@property
def weights(self):
"""return the weights of the stencil"""
assert self._weights is not None
return self._weights
def visualize(self):
if self._weights is None:
ws = None
else:
ws = np.array([w for w in self.weights if w != 0], dtype=float)
pts = np.array([p for i, p in enumerate(self.points) if self.weights[i] != 0], dtype=int)
from pystencils.stencil import plot
plot(pts, data=ws)
def apply(self, access: Field.Access):
return sum([access.get_shifted(*point) * weight for point, weight in zip(self.points, self.weights)])
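# Illustrative usage sketch (not part of the original module): first derivative in x,
# evaluated at the staggered position between a cell and its eastern neighbor;
# the field name 'c' is an arbitrary choice.
def _demo_staggered_first_derivative():
    c = Field.create_generic('c', spatial_dimensions=2)
    fds = FiniteDifferenceStaggeredStencilDerivation("E", dim=2, derivative=(0,))
    return fds.apply(c.center)  # central difference across the cell face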
from collections import defaultdict, namedtuple
import sympy as sp
from pystencils.field import Field
from pystencils.sympyextensions import normalize_product, prod
def _default_diff_sort_key(d):
return str(d.superscript), str(d.target)
class Diff(sp.Expr):
"""Sympy Node representing a derivative.
The differences to sympy's built-in derivative are:
- shortened latex representation
- all simplifications have to be done manually
- optional marker displayed as superscript
"""
is_number = False
is_Rational = False
_diff_wrt = True
def __new__(cls, argument, target=-1, superscript=-1):
if argument == 0:
return sp.Rational(0, 1)
if isinstance(argument, Field):
argument = argument.center
return sp.Expr.__new__(cls, argument.expand(), sp.sympify(target), sp.sympify(superscript))
@property
def is_commutative(self):
any_non_commutative = any(not s.is_commutative for s in self.atoms(sp.Symbol))
if any_non_commutative:
return False
else:
return True
def get_arg_recursive(self):
"""Returns the argument the derivative acts on, for nested derivatives the inner argument is returned"""
if not isinstance(self.arg, Diff):
return self.arg
else:
return self.arg.get_arg_recursive()
def change_arg_recursive(self, new_arg):
"""Returns a Diff node with the given 'new_arg' instead of the current argument. For nested derivatives
a new nested derivative is returned where the inner Diff has the 'new_arg'"""
if not isinstance(self.arg, Diff):
return Diff(new_arg, self.target, self.superscript)
else:
return Diff(self.arg.change_arg_recursive(new_arg), self.target, self.superscript)
def split_linear(self, functions):
"""
Applies the linearity property of Diff: e.g. 'Diff(c*a)' is transformed to 'c * Diff(a)' when c is constant.
The parameter functions is a list of all symbols that are considered functions, not constants.
For the example above: functions=[a]
"""
constant, variable = 1, 1
if self.arg.func != sp.Mul:
constant, variable = 1, self.arg
else:
for factor in normalize_product(self.arg):
if factor in functions or isinstance(factor, Diff):
variable *= factor
else:
constant *= factor
if isinstance(variable, sp.Symbol) and variable not in functions:
return 0
if isinstance(variable, int) or variable.is_number:
return 0
else:
return constant * Diff(variable, target=self.target, superscript=self.superscript)
@property
def arg(self):
"""Expression the derivative acts on"""
return self.args[0]
@property
def target(self):
"""Subscript, usually the variable the Diff is w.r.t. """
return self.args[1]
@property
def superscript(self):
"""Superscript, for example used as the Chapman-Enskog order index"""
return self.args[2]
def _latex(self, printer, *_):
result = r"{\partial"
if self.superscript >= 0:
result += "^{(%s)}" % (self.superscript,)
if self.target != -1:
result += "_{%s}" % (printer.doprint(self.target),)
contents = printer.doprint(self.arg)
if isinstance(self.arg, int) or isinstance(self.arg, sp.Symbol) or self.arg.is_number or self.arg.func == Diff:
result += " " + contents
else:
result += " (" + contents + ") "
result += "}"
return result
def __str__(self):
return f"D({self.arg})"
def interpolated_access(self, offset, **kwargs):
"""Represents an interpolated access on a spatially differentiated field
Args:
offset (Tuple[sympy.Expr]): Absolute position to determine the value of the spatial derivative
"""
from pystencils.interpolation_astnodes import DiffInterpolatorAccess
assert isinstance(self.arg.field, Field), "Must be field to enable interpolated accesses"
return DiffInterpolatorAccess(self.arg.field.interpolated_access(offset, **kwargs).symbol, self.target, *offset)
class DiffOperator(sp.Expr):
"""Un-applied differential, i.e. differential operator
Args:
target: the differential is w.r.t. this variable.
This target is mainly for display purposes (it is the subscript) and to distinguish DiffOperators.
If the target is '-1' no subscript is displayed.
superscript: optional marker displayed as superscript;
not displayed if set to '-1'
The DiffOperator behaves much like a variable with a special name. Its main use is to be applied later, using
DiffOperator.apply(expr, arg), which transforms 'DiffOperator's into applied 'Diff's
"""
is_commutative = True
is_number = False
is_Rational = False
def __new__(cls, target=-1, superscript=-1):
return sp.Expr.__new__(cls, sp.sympify(target), sp.sympify(superscript))
@property
def target(self):
return self.args[0]
@property
def superscript(self):
return self.args[1]
def _latex(self, *_):
result = r"{\partial"
if self.superscript >= 0:
result += "^{(%s)}" % (self.superscript,)
if self.target != -1:
result += "_{%s}" % (self.target,)
result += "}"
return result
@staticmethod
def apply(expr, argument, apply_to_constants=True):
"""
Returns a new expression where each 'DiffOperator' is replaced by a 'Diff' node.
Multiplications of 'DiffOperator's are interpreted as nested application of differentiation:
i.e. DiffOperator('x')*DiffOperator('x') is a second derivative, replaced by Diff(Diff(arg, x), x)
"""
def handle_mul(mul):
args = normalize_product(mul)
diffs = [a for a in args if isinstance(a, DiffOperator)]
if len(diffs) == 0:
return mul * argument if apply_to_constants else mul
rest = [a for a in args if not isinstance(a, DiffOperator)]
diffs.sort(key=_default_diff_sort_key)
result = argument
for d in reversed(diffs):
result = Diff(result, target=d.target, superscript=d.superscript)
return prod(rest) * result
expr = expr.expand()
if expr.func == sp.Mul or expr.func == sp.Pow:
return handle_mul(expr)
elif expr.func == sp.Add:
return expr.func(*[handle_mul(a) for a in expr.args])
else:
return expr * argument if apply_to_constants else expr
# ----------------------------------------------------------------------------------------------------------------------
def diff(expr, *args):
"""Shortcut function to create nested derivatives
>>> f = sp.Symbol("f")
>>> diff(f, 0, 0, 1) == Diff(Diff( Diff(f, 1), 0), 0)
True
"""
if len(args) == 0:
return expr
result = expr
for index in reversed(args):
result = Diff(result, index)
return result
def diff_args(expr):
"""Extracts the indices and argument of possibly nested derivative - inverse of diff function
>>> args = (sp.Symbol("x"), 0, 1, 2, 5, 1)
>>> e = diff(*args)
>>> assert diff_args(e) == args
"""
if not isinstance(expr, Diff):
return expr,
else:
inner_res = diff_args(expr.args[0])
return (inner_res[0], expr.args[1], *inner_res[1:])
def diff_terms(expr):
"""Returns set of all derivatives in an expression.
This function yields different results than 'expr.atoms(Diff)' when nested derivatives are in the expression,
since this function only returns the outer derivatives
Example:
>>> x, y = sp.symbols("x, y")
>>> diff_terms( diff(x, 0, 0) )
{Diff(Diff(x, 0, -1), 0, -1)}
>>> diff_terms( diff(x, 0, 0) + y )
{Diff(Diff(x, 0, -1), 0, -1)}
"""
result = set()
def visit(e):
if isinstance(e, Diff):
result.add(e)
else:
for a in e.args:
visit(a)
visit(expr)
return result
def collect_diffs(expr):
"""Rewrites expression into a sum of distinct derivatives with pre-factors"""
return expr.collect(diff_terms(expr))
def zero_diffs(expr, label):
"""Replaces all differentials with the given target by 0
Example:
>>> x, y, f = sp.symbols("x y f")
>>> expression = Diff(f, x) + Diff(f, y) + Diff(Diff(f, y), x) + 7
>>> zero_diffs(expression, x)
Diff(f, y, -1) + 7
"""
def visit(e):
if isinstance(e, Diff):
if e.target == label:
return 0
new_args = [visit(arg) for arg in e.args]
return e.func(*new_args) if new_args else e
return visit(expr)
def evaluate_diffs(expr, var=None):
"""Replaces pystencils diff objects by sympy diff objects and evaluates them.
Replaces Diff nodes by sp.diff , the free variable is either the target (if var=None) otherwise
the specified var
"""
if isinstance(expr, Diff):
if var is None:
var = expr.target
return sp.diff(evaluate_diffs(expr.arg, var), var)
else:
new_args = [evaluate_diffs(arg, var) for arg in expr.args]
return expr.func(*new_args) if new_args else expr
def normalize_diff_order(expression, functions=None, constants=None, sort_key=_default_diff_sort_key):
"""Assumes order of differentiation can be exchanged. Changes the order of nested Diffs to a standard order defined
by the sorting key 'sort_key' such that the derivative terms can be further simplified """
def visit(expr):
if isinstance(expr, Diff):
nodes = [expr]
while isinstance(nodes[-1].arg, Diff):
nodes.append(nodes[-1].arg)
processed_arg = visit(nodes[-1].arg)
nodes.sort(key=sort_key)
result = processed_arg
for d in reversed(nodes):
result = Diff(result, target=d.target, superscript=d.superscript)
return result
else:
new_args = [visit(e) for e in expr.args]
return expr.func(*new_args) if new_args else expr
expression = expand_diff_linear(expression.expand(), functions, constants).expand()
return visit(expression)
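# Illustrative usage sketch (not part of the original module): mixed derivatives taken
# in different orders normalize to the same canonical nesting and then compare equal.
def _demo_normalize_diff_order():
    f = sp.Symbol("f")
    return normalize_diff_order(diff(f, 1, 0)) == normalize_diff_order(diff(f, 0, 1))  # True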
def expand_diff_full(expr, functions=None, constants=None):
if functions is None:
functions = expr.atoms(sp.Symbol)
if constants is not None:
functions.difference_update(constants)
def visit(e):
if not isinstance(e, sp.Tuple):
e = e.expand()
if e.func == Diff:
result = 0
diff_args = {'target': e.target, 'superscript': e.superscript}
diff_inner = e.args[0]
diff_inner = visit(diff_inner)
if diff_inner.func not in (sp.Add, sp.Mul):
return e
for term in diff_inner.args if diff_inner.func == sp.Add else [diff_inner]:
independent_terms = 1
dependent_terms = []
for factor in normalize_product(term):
if factor in functions or isinstance(factor, Diff):
dependent_terms.append(factor)
else:
independent_terms *= factor
for i in range(len(dependent_terms)):
dependent_term = dependent_terms[i]
other_dependent_terms = dependent_terms[:i] + dependent_terms[i + 1:]
processed_diff = normalize_diff_order(Diff(dependent_term, **diff_args))
result += independent_terms * prod(other_dependent_terms) * processed_diff
return result
elif isinstance(e, sp.Piecewise):
return sp.Piecewise(*((expand_diff_full(a, functions, constants), b) for a, b in e.args))
elif isinstance(e, sp.Tuple):
new_args = [visit(arg) for arg in e.args]
return sp.Tuple(*new_args)
else:
new_args = [visit(arg) for arg in e.args]
return e.func(*new_args) if new_args else e
if isinstance(expr, sp.Matrix):
return expr.applyfunc(visit)
else:
return visit(expr)
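# Illustrative usage sketch (not part of the original module): full expansion applies the
# product rule and pulls declared constants out of the derivative.
def _demo_expand_diff_full():
    c, f, g = sp.symbols("c f g")
    return expand_diff_full(Diff(c * f * g), constants=[c])  # c*g*D(f) + c*f*D(g)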
def expand_diff_linear(expr, functions=None, constants=None):
"""Expands all derivative nodes by applying Diff.split_linear
Args:
expr: expression containing derivatives
functions: sequence of symbols that are considered functions and cannot be pulled before the derivative.
If None, all symbols are viewed as functions
constants: sequence of symbols which are considered constants and can be pulled before the derivative
"""
if functions is None:
functions = expr.atoms(sp.Symbol)
if constants is not None:
functions.difference_update(constants)
if isinstance(expr, Diff):
arg = expand_diff_linear(expr.arg, functions)
if hasattr(arg, 'func') and arg.func == sp.Add:
result = 0
for a in arg.args:
result += Diff(a, target=expr.target, superscript=expr.superscript).split_linear(functions)
return result
else:
diff = Diff(arg, target=expr.target, superscript=expr.superscript)
if diff == 0:
return 0
else:
return diff.split_linear(functions)
elif isinstance(expr, sp.Piecewise):
return sp.Piecewise(*((expand_diff_linear(a, functions, constants), b) for a, b in expr.args))
elif isinstance(expr, sp.Tuple):
new_args = [expand_diff_linear(e, functions) for e in expr.args]
return sp.Tuple(*new_args)
else:
new_args = [expand_diff_linear(e, functions) for e in expr.args]
result = sp.expand(expr.func(*new_args) if new_args else expr)
return result
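# Illustrative usage sketch (not part of the original module): with c marked as a
# constant, linearity pulls it in front of the derivative.
def _demo_expand_diff_linear():
    c, f, g = sp.symbols("c f g")
    return expand_diff_linear(Diff(c * f + g), constants=[c])  # c*D(f) + D(g)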
def expand_diff_products(expr):
"""Fully expands all derivatives by applying product rule"""
if isinstance(expr, Diff):
arg = expand_diff_products(expr.args[0])
if arg.func == sp.Add:
new_args = [Diff(e, target=expr.target, superscript=expr.superscript)
for e in arg.args]
return sp.Add(*new_args)
if arg.func not in (sp.Mul, sp.Pow):
return Diff(arg, target=expr.target, superscript=expr.superscript)
else:
prod_list = normalize_product(arg)
result = 0
for i in range(len(prod_list)):
pre_factor = prod(prod_list[j] for j in range(len(prod_list)) if i != j)
result += pre_factor * Diff(prod_list[i], target=expr.target, superscript=expr.superscript)
return result
else:
new_args = [expand_diff_products(e) for e in expr.args]
return expr.func(*new_args) if new_args else expr
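# Illustrative usage sketch (not part of the original module): the product rule turns a
# derivative of a product into a sum of single-factor derivatives.
def _demo_expand_diff_products():
    x, y = sp.symbols("x y")
    return expand_diff_products(Diff(x * y))  # x*D(y) + y*D(x)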
def combine_diff_products(expr):
"""Inverse product rule"""
def expr_to_diff_decomposition(expression):
"""Decomposes a sp.Add node containing CeDiffs into:
diff_dict: maps (target, superscript) -> [ (pre_factor, argument), ... ]
i.e. a partial(b) ( a is pre-factor, b is argument)
in case of partial(a) partial(b) two entries are created (0.5 partial(a), b), (0.5 partial(b), a)
"""
DiffInfo = namedtuple("DiffInfo", ["target", "superscript"])
class DiffSplit:
def __init__(self, fac, argument):
self.pre_factor = fac
self.argument = argument
def __repr__(self):
return str((self.pre_factor, self.argument))
assert isinstance(expression, sp.Add)
diff_dict = defaultdict(list)
rest = 0
for term in expression.args:
if isinstance(term, Diff):
diff_dict[DiffInfo(term.target, term.superscript)].append(DiffSplit(1, term.arg))
else:
mul_args = normalize_product(term)
diffs = [d for d in mul_args if isinstance(d, Diff)]
factor = prod(d for d in mul_args if not isinstance(d, Diff))
if len(diffs) == 0:
rest += factor
else:
for i, diff in enumerate(diffs):
all_but_current = [d for j, d in enumerate(diffs) if i != j]
pre_factor = factor * prod(all_but_current) * sp.Rational(1, len(diffs))
diff_dict[DiffInfo(diff.target, diff.superscript)].append(DiffSplit(pre_factor, diff.arg))
return diff_dict, rest
def match_diff_splits(own, other):
own_fac = own.pre_factor / other.argument
other_fac = other.pre_factor / own.argument
count = sp.count_ops
if count(own_fac) > count(own.pre_factor) or count(other_fac) > count(other.pre_factor):
return None
new_other_factor = own_fac - other_fac
return new_other_factor
def process_diff_list(diff_list, label, superscript):
if len(diff_list) == 0:
return 0
elif len(diff_list) == 1:
return diff_list[0].pre_factor * Diff(diff_list[0].argument, label, superscript)
result = 0
matches = []
for i in range(1, len(diff_list)):
match_result = match_diff_splits(diff_list[i], diff_list[0])
if match_result is not None:
matches.append((i, match_result))
if len(matches) == 0:
result += diff_list[0].pre_factor * Diff(diff_list[0].argument, label, superscript)
else:
other_idx, match_result = sorted(matches, key=lambda e: sp.count_ops(e[1]))[0]
new_argument = diff_list[0].argument * diff_list[other_idx].argument
result += (diff_list[0].pre_factor / diff_list[other_idx].argument) * Diff(new_argument, label, superscript)
if match_result == 0:
del diff_list[other_idx]
else:
diff_list[other_idx].pre_factor = match_result * diff_list[0].argument
result += process_diff_list(diff_list[1:], label, superscript)
return result
def combine(expression):
expression = expression.expand()
if isinstance(expression, sp.Add):
diff_dict, rest = expr_to_diff_decomposition(expression)
for (label, superscript), diff_list in diff_dict.items():
rest += process_diff_list(diff_list, label, superscript)
return rest
else:
new_args = [combine_diff_products(e) for e in expression.args]
return expression.func(*new_args) if new_args else expression
return combine(expr)
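# Illustrative usage sketch (not part of the original module): the inverse product rule
# recombines an expanded expression back into a single derivative.
def _demo_combine_diff_products():
    x, y = sp.symbols("x y")
    return combine_diff_products(x * Diff(y) + y * Diff(x))  # D(x*y)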
def replace_generic_laplacian(expr, dim=None):
"""Laplacian can be written as Diff(Diff(term)) without explicitly giving the dimensions.
This function replaces these constructs by diff(term, 0, 0) + diff(term, 1, 1) + ...
For this to work, the arguments of the derivative have to be fields or field accesses such that the spatial
dimension can be determined.
>>> l = Diff(Diff(sp.symbols('x')))
>>> replace_generic_laplacian(l, 3)
Diff(Diff(x, 0, -1), 0, -1) + Diff(Diff(x, 1, -1), 1, -1) + Diff(Diff(x, 2, -1), 2, -1)
"""
if isinstance(expr, Diff):
arg, *indices = diff_args(expr)
if isinstance(arg, Field.Access):
dim = arg.field.spatial_dimensions
assert dim is not None
if len(indices) == 2 and all(i == -1 for i in indices):
return sum(diff(arg, i, i) for i in range(dim))
else:
return expr
else:
new_args = [replace_generic_laplacian(a, dim) for a in expr.args]
return expr.func(*new_args) if new_args else expr
def functional_derivative(functional, v):
r"""Computes functional derivative of functional with respect to v using Euler-Lagrange equation
.. math ::
\frac{\delta F}{\delta v} =
\frac{\partial F}{\partial v} - \nabla \cdot \frac{\partial F}{\partial \nabla v}
- assumes that gradients are represented by Diff() nodes
- Diff(Diff(r)) represents the divergence of r
"""
diffs = functional.atoms(Diff)
bulk_substitutions = {d: sp.Dummy() for d in diffs}
bulk_substitutions_inverse = {v: k for k, v in bulk_substitutions.items()}
non_diff_part = functional.subs(bulk_substitutions)
partial_f_partial_v = sp.diff(non_diff_part, v).subs(bulk_substitutions_inverse)
gradient_part = 0
for diff_obj in diffs:
if diff_obj.args[0] != v:
continue
dummy = sp.Dummy()
partial_f_partial_grad_v = functional.subs(diff_obj, dummy).diff(dummy).subs(dummy, diff_obj)
gradient_part += Diff(partial_f_partial_grad_v, target=diff_obj.target, superscript=diff_obj.superscript)
result = partial_f_partial_v - gradient_part
return result
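# Illustrative usage sketch (not part of the original module): functional derivative of a
# double-well potential plus a square-gradient term; c and kappa are arbitrary symbols.
def _demo_functional_derivative():
    c, kappa = sp.symbols("c kappa")
    functional = c**2 * (1 - c)**2 + kappa / 2 * Diff(c)**2
    return functional_derivative(functional, c)  # bulk derivative minus Diff(kappa*Diff(c))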
from typing import Optional, Union
import numpy as np
import sympy as sp
from pystencils.fd import Diff
from pystencils.fd.derivative import diff_args
from pystencils.fd.spatial import fd_stencils_standard
from pystencils.field import Field
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.sympyextensions import fast_subs
FieldOrFieldAccess = Union[Field, Field.Access]
# --------------------------------------- Advection Diffusion ----------------------------------------------------------
def diffusion(scalar, diffusion_coeff, idx=None):
"""Diffusion term ∇·( diffusion_coeff · ∇(scalar))
Examples:
>>> f = Field.create_generic('f', spatial_dimensions=2)
>>> d = sp.Symbol("d")
>>> dx = sp.Symbol("dx")
>>> diffusion_term = diffusion(scalar=f, diffusion_coeff=d)
>>> discretization = Discretization2ndOrder()
>>> expected_output = ((f[-1, 0] + f[0, -1] - 4 * f[0, 0] + f[0, 1] + f[1, 0]) * d) / dx**2
>>> sp.simplify(discretization(diffusion_term) - expected_output)
0
"""
if isinstance(scalar, Field):
first_arg = scalar.center
elif isinstance(scalar, Field.Access):
first_arg = scalar
else:
raise ValueError("Diffused scalar has to be a pystencils Field or Field.Access")
args = [first_arg, diffusion_coeff if not isinstance(diffusion_coeff, Field) else diffusion_coeff.center]
if idx is not None:
args.append(idx)
return Diffusion(*args)
def advection(advected_scalar: FieldOrFieldAccess, velocity_field: FieldOrFieldAccess, idx: Optional[int] = None):
"""Advection term ∇·(velocity_field · advected_scalar)
Term that describes the advection of a scalar quantity in a velocity field.
"""
if isinstance(advected_scalar, Field):
first_arg = advected_scalar.center
elif isinstance(advected_scalar, Field.Access):
first_arg = advected_scalar
else:
raise ValueError("Advected scalar has to be a pystencils Field or Field.Access")
args = [first_arg, velocity_field if not isinstance(velocity_field, Field) else velocity_field.center]
if idx is not None:
args.append(idx)
return Advection(*args)
def transient(scalar, idx=None):
"""Transient term ∂_t(scalar)"""
if isinstance(scalar, Field):
args = [scalar.center]
elif isinstance(scalar, Field.Access):
args = [scalar]
else:
raise ValueError("Scalar has to be a pystencils Field or Field.Access")
if idx is not None:
args.append(idx)
return Transient(*args)
class Discretization2ndOrder:
def __init__(self, dx=sp.Symbol("dx"), dt=sp.Symbol("dt"), discretization_stencil_func=fd_stencils_standard):
self.dx = dx
self.dt = dt
self.spatial_stencil = discretization_stencil_func
def _discretize_diffusion(self, e):
result = 0
for c in range(e.dim):
first_diffs = [offset
* (e.diffusion_scalar_at_offset(c, offset) * e.diffusion_coefficient_at_offset(c, offset)
- e.diffusion_scalar_at_offset(0, 0) * e.diffusion_coefficient_at_offset(0, 0))
for offset in [-1, 1]]
result += first_diffs[1] - first_diffs[0]
return result / (self.dx ** 2)
def _discretize_advection(self, expr):
result = 0
for c in range(expr.dim):
interpolated = [(expr.advected_scalar_at_offset(c, offset) * expr.velocity_field_at_offset(c, offset, c)
+ expr.advected_scalar_at_offset(c, 0) * expr.velocity_field_at_offset(c, 0, c)) / 2
for offset in [-1, 1]]
result += interpolated[1] - interpolated[0]
return result / self.dx
def _discretize_spatial(self, e):
if isinstance(e, Diffusion):
return self._discretize_diffusion(e)
elif isinstance(e, Advection):
return self._discretize_advection(e)
elif isinstance(e, Diff):
arg, *indices = diff_args(e)
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return self.spatial_stencil(indices, self.dx, arg)
else:
new_args = [self._discretize_spatial(a) for a in e.args]
return e.func(*new_args) if new_args else e
def __call__(self, expr):
if isinstance(expr, list):
return [self(e) for e in expr]
elif isinstance(expr, sp.Matrix) or isinstance(expr, sp.ImmutableDenseMatrix):
return expr.applyfunc(self.__call__)
elif isinstance(expr, AssignmentCollection):
# discretize each right-hand side; copying the assignments unchanged would make this branch a no-op
return expr.copy(main_assignments=[e.func(e.lhs, self(e.rhs)) for e in expr.main_assignments],
subexpressions=[e.func(e.lhs, self(e.rhs)) for e in expr.subexpressions])
transient_terms = expr.atoms(Transient)
if len(transient_terms) == 0:
return self._discretize_spatial(expr)
elif len(transient_terms) == 1:
transient_term = transient_terms.pop()
solve_result = sp.solve(expr, transient_term)
if len(solve_result) != 1:
raise ValueError("Could not solve for transient term" + str(solve_result))
rhs = solve_result.pop()
# explicit euler
return transient_term.scalar + self.dt * self._discretize_spatial(rhs)
else:
raise NotImplementedError("Cannot discretize expression with more than one transient term: "
+ str(transient_terms))
# -------------------------------------- Helper Classes ----------------------------------------------------------------
class Advection(sp.Function):
@property
def scalar(self):
return self.args[0].field
@property
def vector(self):
if isinstance(self.args[1], Field.Access):
return self.args[1].field
else:
return self.args[1]
@property
def scalar_index(self):
return None if len(self.args) <= 2 else int(self.args[2])
@property
def dim(self):
return self.scalar.spatial_dimensions
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
if isinstance(self.vector, Field):
return r"\nabla \cdot(%s %s)" % (printer.doprint(sp.Symbol(self.vector.name)),
printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
else:
args = [r"\partial_%d(%s %s)" % (i, printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),
printer.doprint(self.vector[i]))
for i in range(self.dim)]
return " + ".join(args)
# --- Interface for discretization strategy
def velocity_field_at_offset(self, offset_dim, offset_value, index):
v = self.vector
if isinstance(v, Field):
assert v.index_dimensions == 1
return v.neighbor(offset_dim, offset_value)(index)
else:
return v[index]
def advected_scalar_at_offset(self, offset_dim, offset_value):
idx = 0 if self.scalar_index is None else int(self.scalar_index)
return self.scalar.neighbor(offset_dim, offset_value)(idx)
class Diffusion(sp.Function):
@property
def scalar(self):
return self.args[0].field
@property
def diffusion_coeff(self):
if isinstance(self.args[1], Field.Access):
return self.args[1].field
else:
return self.args[1]
@property
def scalar_index(self):
return None if len(self.args) <= 2 else int(self.args[2])
@property
def dim(self):
return self.scalar.spatial_dimensions
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
coeff = self.diffusion_coeff
diff_coeff = sp.Symbol(coeff.name) if isinstance(coeff, Field) else coeff
return r"div(%s \nabla %s)" % (printer.doprint(diff_coeff),
printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
# --- Interface for discretization strategy
def diffusion_scalar_at_offset(self, offset_dim, offset_value):
idx = 0 if self.scalar_index is None else self.scalar_index
return self.scalar.neighbor(offset_dim, offset_value)(idx)
def diffusion_coefficient_at_offset(self, offset_dim, offset_value):
d = self.diffusion_coeff
if isinstance(d, Field):
return d.neighbor(offset_dim, offset_value)
else:
return d
class Transient(sp.Function):
@property
def scalar(self):
if self.scalar_index is None:
return self.args[0].field.center
else:
return self.args[0].field(self.scalar_index)
@property
def scalar_index(self):
return None if len(self.args) <= 1 else int(self.args[1])
def _latex(self, printer):
name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
return r"\partial_t %s" % (printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),)
# -------------------------------------------- Deprecated Functions ----------------------------------------------------
def grad(var, dim=3):
r"""
Gradients are represented as a special symbol:
e.g. :math:`\nabla x = (x^{\Delta 0}, x^{\Delta 1}, x^{\Delta 2})`
This function takes a symbol and creates the gradient symbols according to convention above
Args:
var: symbol to take the gradient of
dim: dimension (length) of the gradient vector
"""
if hasattr(var, "__getitem__"):
return [[sp.Symbol("%s^Delta^%d" % (v.name, i)) for v in var] for i in range(dim)]
else:
return [sp.Symbol("%s^Delta^%d" % (var.name, i)) for i in range(dim)]
def discretize_center(term, symbols_to_field_dict, dx, dim=3):
"""
Expects a term that contains the given symbols and gradient components of these symbols, and replaces them
by field accesses. Gradients are replaced by central approximations:
``(upper neighbor - lower neighbor) / (2*dx)``
Args:
term: term where symbols and gradient(symbol) should be replaced
symbols_to_field_dict: mapping of symbols to Field
dx: width and height of one cell
dim: dimension
Example:
>>> x = sp.Symbol("x")
>>> grad_x = grad(x, dim=3)
>>> term = x * grad_x[0]
>>> term
x*x^Delta^0
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> expected_output = f[0, 0, 0] * (-f[-1, 0, 0]/2 + f[1, 0, 0]/2)
>>> sp.simplify(discretize_center(term, { x: f }, dx=1, dim=3) - expected_output)
0
"""
substitutions = {}
for symbols, field in symbols_to_field_dict.items():
if not hasattr(symbols, "__getitem__"):
symbols = [symbols]
g = grad(symbols, dim)
substitutions.update({symbol: field(i) for i, symbol in enumerate(symbols)})
for d in range(dim):
up, down = __up_down_offsets(d, dim)
substitutions.update({g[d][i]: (field[up](i) - field[down](i)) / dx / 2 for i in range(len(symbols))})
return fast_subs(term, substitutions)
def discretize_staggered(term, symbols_to_field_dict, coordinate, coordinate_offset, dx, dim=3):
"""
Expects a term that contains the given symbols and gradient components of these symbols, and replaces them
by field accesses. Gradients in the coordinate direction are replaced by the staggered version at the cell boundary;
symbols themselves and gradients in other directions are replaced by an interpolated version at the cell face.
Args:
term: input term where symbols and gradients are replaced
symbols_to_field_dict: mapping of symbols to Field
coordinate: coordinate id (0 for x, 1 for y, ...) defining the cell boundary.
Only gradients in this direction, i.e. symbol^Delta^coordinate, are replaced
coordinate_offset: either +1 or -1 for upper or lower face in coordinate direction
dx: width and height of one cell
dim: dimension
Examples:
Discretizing at the right/east face of a cell, i.e. coordinate=0, coordinate_offset=1
>>> x, dx = sp.symbols("x dx")
>>> grad_x = grad(x, dim=3)
>>> term = x * grad_x[0]
>>> term
x*x^Delta^0
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> discretize_staggered(term, symbols_to_field_dict={ x: f}, dx=dx, coordinate=0, coordinate_offset=1, dim=3)
(-f_C + f_E)*(f_C/2 + f_E/2)/dx
"""
assert coordinate_offset == 1 or coordinate_offset == -1
assert 0 <= coordinate < dim
substitutions = {}
for symbols, field in symbols_to_field_dict.items():
if not hasattr(symbols, "__getitem__"):
symbols = [symbols]
offset = [0] * dim
offset[coordinate] = coordinate_offset
offset = np.array(offset, dtype=int)
gradient = grad(symbols)[coordinate]
substitutions.update({s: (field[offset](i) + field(i)) / 2 for i, s in enumerate(symbols)})
substitutions.update({g: (field[offset](i) - field(i)) / dx * coordinate_offset
for i, g in enumerate(gradient)})
for d in range(dim):
if d == coordinate:
continue
up, down = __up_down_offsets(d, dim)
for i, s in enumerate(symbols):
center_grad = (field[up](i) - field[down](i)) / (2 * dx)
neighbor_grad = (field[up + offset](i) - field[down + offset](i)) / (2 * dx)
substitutions[grad(s)[d]] = (center_grad + neighbor_grad) / 2
return fast_subs(term, substitutions)
def discretize_divergence(vector_term, symbols_to_field_dict, dx):
"""
Computes discrete divergence of symbolic vector
Args:
vector_term: sequence of terms, interpreted as vector
symbols_to_field_dict: mapping of symbols to Field
dx: length of a cell
Examples:
Laplace stencil
>>> x, dx = sp.symbols("x dx")
>>> grad_x = grad(x, dim=3)
>>> f = Field.create_generic('f', spatial_dimensions=3)
>>> expected_output = (f[-1, 0, 0] + f[0, -1, 0] + f[0, 0, -1] -
... 6*f[0, 0, 0] + f[0, 0, 1] + f[0, 1, 0] + f[1, 0, 0])/dx**2
>>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx) - expected_output)
0
"""
dim = len(vector_term)
result = 0
for d in range(dim):
for offset in [-1, 1]:
result += offset * discretize_staggered(vector_term[d], symbols_to_field_dict, d, offset, dx, dim)
return result / dx
def __up_down_offsets(d, dim):
coord = [0] * dim
coord[d] = 1
up = np.array(coord, dtype=int)
coord[d] = -1
down = np.array(coord, dtype=int)
return up, down
import pystencils as ps
import sympy as sp
from pystencils.fd.derivation import FiniteDifferenceStaggeredStencilDerivation as FDS, \
FiniteDifferenceStencilDerivation as FD
import itertools
from collections import defaultdict
from collections.abc import Iterable
def get_access_and_direction(term):
direction1 = term.args[1]
if isinstance(term.args[0], ps.Field.Access): # first derivative
access = term.args[0]
direction = (direction1,)
elif isinstance(term.args[0], ps.fd.Diff): # nested derivative
if isinstance(term.args[0].args[0], ps.fd.Diff): # third or higher derivative
raise ValueError("can only handle first and second derivatives")
elif not isinstance(term.args[0].args[0], ps.Field.Access):
raise ValueError("can only handle derivatives of field accesses")
access, direction2 = term.args[0].args[:2]
direction = (direction1, direction2)
else:
raise NotImplementedError(f"can only deal with derivatives of field accesses, "
f"but not {type(term.args[0])}; expansion of derivatives probably failed")
return access, direction
class FVM1stOrder:
"""Finite-volume discretization
Args:
field: the field with the quantity to calculate, e.g. a concentration
flux: a list of sympy expressions that specify the flux, one for each Cartesian direction
source: a list of sympy expressions that specify the source
"""
def __init__(self, field: ps.field.Field, flux=0, source=0):
def normalize(f, shape):
shape = tuple(s for s in shape if s != 1)
if not shape:
shape = None
if isinstance(f, sp.Array) or isinstance(f, Iterable) or isinstance(f, sp.Matrix):
return sp.Array(f, shape)
else:
return sp.Array([f] * (sp.Mul(*shape) if shape else 1))
self.c = field
self.dim = self.c.spatial_dimensions
self.j = normalize(flux, (self.dim, ) + self.c.index_shape)
self.q = normalize(source, self.c.index_shape)
def discrete_flux(self, flux_field: ps.field.Field):
"""Return a list of assignments for the discrete fluxes
Args:
flux_field: a staggered field to which the fluxes should be assigned
"""
assert ps.FieldType.is_staggered(flux_field)
num = 0
def discretize(term, neighbor):
nonlocal num
if isinstance(term, sp.Matrix):
nw = term.applyfunc(lambda t: discretize(t, neighbor))
return nw
elif isinstance(term, ps.field.Field.Access):
avg = (term.get_shifted(*neighbor) + term) * sp.Rational(1, 2)
return avg
elif isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
fds = FDS(neighbor, access.field.spatial_dimensions, direction,
free_weights_prefix=f'fvm_free_{num}' if sp.Matrix(neighbor).dot(neighbor) > 2 else None)
num += 1
return fds.apply(access)
if term.args:
new_args = [discretize(a, neighbor) for a in term.args]
return term.func(*new_args)
else:
return term
fluxes = self.j.applyfunc(ps.fd.derivative.expand_diff_full)
fluxes = [sp.Matrix(fluxes.tolist()[i]) if flux_field.index_dimensions > 1 else fluxes.tolist()[i]
for i in range(self.dim)]
A0 = sum([sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm()
for d in flux_field.staggered_stencil]) / self.dim
discrete_fluxes = []
for neighbor in flux_field.staggered_stencil:
neighbor = ps.stencil.direction_string_to_offset(neighbor)
directional_flux = fluxes[0] * int(neighbor[0])
for i in range(1, self.dim):
directional_flux += fluxes[i] * int(neighbor[i])
discrete_flux = sp.simplify(discretize(directional_flux, neighbor))
free_weights = [s for s in discrete_flux.atoms(sp.Symbol) if s.name.startswith('fvm_free_')]
if len(free_weights) > 0:
discrete_flux = discrete_flux.collect(discrete_flux.atoms(ps.field.Field.Access))
access_counts = defaultdict(list)
for values in itertools.product([-1, 0, 1],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
simp = discrete_flux.subs(subs)
access_count = len(simp.atoms(ps.field.Field.Access))
access_counts[access_count].append(simp)
best_count = min(access_counts.keys())
discrete_flux = sum(access_counts[best_count]) / len(access_counts[best_count])
discrete_fluxes.append(discrete_flux / sp.Matrix(neighbor).norm())
if flux_field.index_dimensions > 1:
return [ps.Assignment(lhs, rhs / A0)
for i, d in enumerate(flux_field.staggered_stencil) if discrete_fluxes[i]
for lhs, rhs in zip(flux_field.staggered_vector_access(d), sp.simplify(discrete_fluxes[i]))]
else:
return [ps.Assignment(flux_field.staggered_access(d), sp.simplify(discrete_fluxes[i]) / A0)
for i, d in enumerate(flux_field.staggered_stencil)]
def discrete_source(self):
"""Return a list of assignments for the discrete source term"""
def discretize(term):
if isinstance(term, ps.fd.Diff):
access, direction = get_access_and_direction(term)
if self.dim == 2:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ")
if "".join(a).strip()]
else:
stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ", "TB ")
if "".join(a).strip()]
weights = None
for stencil in [["N", "S", "E", "W", "T", "B"][:2 * self.dim], stencil]:
stencil = [tuple(ps.stencil.direction_string_to_offset(d, self.dim)) for d in stencil]
derivation = FD(direction, stencil).get_stencil()
if not derivation.accuracy:
continue
weights = derivation.weights
# if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
if len(free_weights) > 0:
zero_counts = defaultdict(list)
for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
repeat=len(free_weights)):
subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
weights = [w.subs(subs) for w in derivation.weights]
if not all(a == 0 for a in weights):
zero_count = sum([1 for w in weights if w == 0])
zero_counts[zero_count].append(weights)
best = zero_counts[max(zero_counts.keys())]
if len(best) > 1:
raise NotImplementedError("more than one suitable set of weights found, "
"don't know how to proceed")
weights = best[0]
break
if not weights:
raise Exception('the requested derivative cannot be performed with the available neighbors')
assert weights
if access._field.index_dimensions == 0:
return sum([access._field.__getitem__(point) * weight for point, weight in zip(stencil, weights)])
else:
total = access.get_shifted(*stencil[0]).at_index(*access.index) * weights[0]
for point, weight in zip(stencil[1:], weights[1:]):
addl = access.get_shifted(*point).at_index(*access.index) * weight
total += addl
return total
if term.args:
new_args = [discretize(a) for a in term.args]
return term.func(*new_args)
else:
return term
source = self.q.applyfunc(ps.fd.derivative.expand_diff_full)
source = source.applyfunc(discretize)
return [ps.Assignment(lhs, rhs) for lhs, rhs in zip(self.c.center_vector, sp.flatten(source)) if rhs]
def discrete_continuity(self, flux_field: ps.field.Field):
"""Return a list of assignments for the continuity equation, which includes the source term
Args:
flux_field: a staggered field from which the fluxes are taken
"""
assert ps.FieldType.is_staggered(flux_field)
neighbors = flux_field.staggered_stencil + [ps.stencil.inverse_direction_string(d)
for d in flux_field.staggered_stencil]
divergence = flux_field.staggered_vector_access(neighbors[0])
for d in neighbors[1:]:
divergence += flux_field.staggered_vector_access(d)
source = self.discrete_source()
source = {s.lhs: s.rhs for s in source}
return [ps.Assignment(lhs, (lhs - rhs + source[lhs]) if lhs in source else (lhs - rhs))
for lhs, rhs in zip(self.c.center_vector, divergence)]
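# Illustrative usage sketch (not part of the original module): finite-volume advection
# with flux j = v*c on a D2Q5 staggered flux field; the field names are arbitrary, and
# the field_type keyword of ps.fields is assumed to be available as in current pystencils.
def _demo_fvm_advection():
    c = ps.fields("c: [2D]")
    v = ps.fields("v(2): [2D]")
    j = ps.fields("j(2): [2D]", field_type=ps.FieldType.STAGGERED_FLUX)
    fvm = FVM1stOrder(c, flux=[v(i) * c.center for i in range(2)])
    return fvm.discrete_flux(j) + fvm.discrete_continuity(j)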
def VOF(j: ps.field.Field, v: ps.field.Field, ρ: ps.field.Field):
"""Volume-of-fluid discretization of advection
Args:
j: the staggered field to write the fluxes to. Should have a D2Q9/D3Q27 stencil. Other stencils work too, but
incur a small error (D2Q5/D3Q7: v^2, D3Q19: v^3).
v: the flow velocity field
ρ: the quantity to advect
"""
assert ps.FieldType.is_staggered(j)
fluxes = [[] for i in range(j.index_shape[0])]
v0 = v.center_vector
for d, neighbor in enumerate(j.staggered_stencil):
c = ps.stencil.direction_string_to_offset(neighbor)
v1 = v.neighbor_vector(c)
# going out
cond = sp.And(*[sp.Or(c[i] * v0[i] > 0, c[i] == 0) for i in range(len(v0))])
overlap1 = [1 - sp.Abs(v0[i]) for i in range(len(v0))]
overlap2 = [c[i] * v0[i] for i in range(len(v0))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v0))])
fluxes[d].append(ρ.center_vector * overlap * sp.Piecewise((1, cond), (0, True)))
# coming in
cond = sp.And(*[sp.Or(c[i] * v1[i] < 0, c[i] == 0) for i in range(len(v1))])
overlap1 = [1 - sp.Abs(v1[i]) for i in range(len(v1))]
overlap2 = [v1[i] for i in range(len(v1))]
overlap = sp.Mul(*[(overlap1[i] if c[i] == 0 else overlap2[i]) for i in range(len(v1))])
sign = (c == 1).sum() % 2 * 2 - 1
fluxes[d].append(sign * ρ.neighbor_vector(c) * overlap * sp.Piecewise((1, cond), (0, True)))
for i, ff in enumerate(fluxes):
fluxes[i] = ff[0]
for f in ff[1:]:
fluxes[i] += f
assignments = []
for i, d in enumerate(j.staggered_stencil):
for lhs, rhs in zip(j.staggered_vector_access(d).values(), fluxes[i].values()):
assignments.append(ps.Assignment(lhs, rhs))
return assignments
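# Illustrative usage sketch (not part of the original module): VOF flux assignments for
# a scalar rho advected by velocity v, using the recommended D2Q9 staggered flux field
# (4 staggered directions in 2D); the field names are arbitrary choices.
def _demo_vof():
    j = ps.fields("j(4): [2D]", field_type=ps.FieldType.STAGGERED_FLUX)
    v = ps.fields("v(2): [2D]")
    rho = ps.fields("rho: [2D]")
    return VOF(j, v, rho)  # list of assignments writing the fluxes to j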
from functools import lru_cache
from typing import Tuple
import sympy as sp
from pystencils.astnodes import LoopOverCoordinate
from pystencils.fd import Diff
from pystencils.field import Field
from pystencils.transformations import generic_visit
from .derivation import FiniteDifferenceStencilDerivation
from .derivative import diff_args
def fd_stencils_standard(indices, dx, fa):
order = len(indices)
assert all(i >= 0 for i in indices), "Can only discretize objects with (integer) subscripts"
if order == 1:
idx = indices[0]
return (fa.neighbor(idx, 1) - fa.neighbor(idx, -1)) / (2 * dx)
elif order == 2:
if indices[0] == indices[1]:
return (-2 * fa + fa.neighbor(indices[0], -1) + fa.neighbor(indices[0], +1)) / (dx ** 2)
else:
offsets = [(1, 1), (-1, 1), (1, -1), (-1, -1)]
return sum(o1 * o2 * fa.neighbor(indices[0], o1).neighbor(indices[1], o2)
for o1, o2 in offsets) / (4 * dx ** 2)
raise NotImplementedError("Supports only derivatives up to order 2")
def fd_stencils_isotropic(indices, dx, fa):
dim = fa.field.spatial_dimensions
if dim == 1:
return fd_stencils_standard(indices, dx, fa)
elif dim == 2:
order = len(indices)
if order == 1:
idx = indices[0]
assert 0 <= idx < 2
other_idx = 1 if indices[0] == 0 else 0
weights = {-1: sp.Rational(1, 12) / dx,
0: sp.Rational(1, 3) / dx,
1: sp.Rational(1, 12) / dx}
upper_terms = sum(fa.neighbor(idx, +1).neighbor(other_idx, off) * w for off, w in weights.items())
lower_terms = sum(fa.neighbor(idx, -1).neighbor(other_idx, off) * w for off, w in weights.items())
return upper_terms - lower_terms
elif order == 2:
if indices[0] == indices[1]:
idx = indices[0]
other_idx = 1 if idx == 0 else 0
diagonals = sp.Rational(1, 12) * sum(fa.neighbor(0, i).neighbor(1, j) for i in (-1, 1) for j in (-1, 1))
div_direction = sp.Rational(5, 6) * sum(fa.neighbor(idx, i) for i in (-1, 1))
other_direction = - sp.Rational(1, 6) * sum(fa.neighbor(other_idx, i) for i in (-1, 1))
center = - sp.Rational(5, 3) * fa
return (diagonals + div_direction + other_direction + center) / (dx ** 2)
else:
return fd_stencils_standard(indices, dx, fa)
raise NotImplementedError("Supports only derivatives up to order 2 for 1D and 2D setups")
def fd_stencils_forth_order_isotropic(indices, dx, fa):
order = len(indices)
if order != 1:
raise NotImplementedError("Forth order finite difference discretization is "
"currently only supported for first derivatives")
dim = indices[0]
if dim not in (0, 1):
raise NotImplementedError("Forth order finite difference discretization is only implemented for 2D")
stencils = forth_order_2d_derivation()
return stencils[dim].apply(fa) / dx
def discretize_spatial(expr, dx, stencil=fd_stencils_standard):
if isinstance(stencil, str):
if stencil == 'standard':
stencil = fd_stencils_standard
elif stencil == 'isotropic':
stencil = fd_stencils_isotropic
else:
raise ValueError("Unknown stencil. Supported 'standard' and 'isotropic'")
def visitor(e):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if not isinstance(arg, Field.Access):
raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
return stencil(indices, dx, arg)
else:
new_args = [discretize_spatial(a, dx, stencil) for a in e.args]
return e.func(*new_args) if new_args else e
return generic_visit(expr, visitor)
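# Illustrative usage sketch (not part of the original module): discretize a 2D Laplacian
# with the isotropic stencil; the field name 'f' and the spacing symbol dx are arbitrary.
def _demo_discretize_laplacian():
    f = Field.create_generic('f', spatial_dimensions=2)
    dx = sp.Symbol("dx")
    laplacian = Diff(Diff(f.center, 0), 0) + Diff(Diff(f.center, 1), 1)
    return discretize_spatial(laplacian, dx, stencil='isotropic')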
def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
def staggered_visitor(e, coordinate, sign):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if len(indices) != 1:
raise ValueError("Function supports only up to second derivatives")
if not isinstance(arg, Field.Access):
raise ValueError("Argument of inner derivative has to be field access")
target = indices[0]
if target == coordinate:
assert sign in (-1, 1)
return (arg.neighbor(coordinate, sign) - arg) / dx * sign
else:
return (stencil(indices, dx, arg.neighbor(coordinate, sign))
+ stencil(indices, dx, arg)) / 2
elif isinstance(e, Field.Access):
return (e.neighbor(coordinate, sign) + e) / 2
elif isinstance(e, sp.Symbol):
loop_idx = LoopOverCoordinate.is_loop_counter_symbol(e)
return e + sign / 2 if loop_idx == coordinate else e
else:
new_args = [staggered_visitor(a, coordinate, sign) for a in e.args]
return e.func(*new_args) if new_args else e
def visitor(e):
if isinstance(e, Diff):
arg, *indices = diff_args(e)
if isinstance(arg, Field.Access):
return stencil(indices, dx, arg)
else:
if not len(indices) == 1:
raise ValueError("This term is not support by the staggered discretization strategy")
target = indices[0]
return (staggered_visitor(arg, target, 1) - staggered_visitor(arg, target, -1)) / dx
else:
new_args = [visitor(a) for a in e.args]
return e.func(*new_args) if new_args else e
return generic_visit(expr, visitor)
# -------------------------------------- special stencils --------------------------------------------------------------
@lru_cache(maxsize=1)
def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
# Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
# one weight has to be specifically set to a somewhat arbitrary value
second_neighbor_weight = sp.Rational(1, 10)
second_neighbor_stencil = [(i, j)
for i in (-2, -1, 0, 1, 2)
for j in (-2, -1, 0, 1, 2)
]
x_diff = FiniteDifferenceStencilDerivation((0,), second_neighbor_stencil)
x_diff.set_weight((2, 0), second_neighbor_weight)
x_diff.assume_symmetric(0, anti_symmetric=True)
x_diff.assume_symmetric(1)
x_diff_stencil = x_diff.get_stencil(isotropic=True)
y_diff = FiniteDifferenceStencilDerivation((1,), second_neighbor_stencil)
y_diff.set_weight((0, 2), second_neighbor_weight)
y_diff.assume_symmetric(1, anti_symmetric=True)
y_diff.assume_symmetric(0)
y_diff_stencil = y_diff.get_stencil(isotropic=True)
return x_diff_stencil, y_diff_stencil
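# Example (sketch): applying the derived fourth-order isotropic stencils to a
# field access. The field name 'f' is illustrative; divide by dx to obtain the
# derivative, cf. fd_stencils_forth_order_isotropic above.
#
#   >>> from pystencils import fields
#   >>> f = fields("f: double[2D]")
#   >>> x_stencil, y_stencil = forth_order_2d_derivation()
#   >>> df_dx = x_stencil.apply(f.center)  # approximates dx * (first x-derivative)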
import functools
import hashlib
import operator
import pickle
import re
from enum import Enum
from itertools import chain
from typing import List, Optional, Sequence, Set, Tuple, Union
import numpy as np
import sympy as sp
from sympy.core.cache import cacheit
import pystencils
from pystencils.alignedarray import aligned_empty
from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
from pystencils.stencil import (
direction_string_to_offset, inverse_direction, offset_to_direction_string)
from pystencils.sympyextensions import is_integer_sequence
__all__ = ['Field', 'fields', 'FieldType']
class FieldType(Enum):
# generic fields
GENERIC = 0
# index fields are currently only used for boundary handling
# the coordinates are not the loop counters in that case, but are read from this index field
INDEXED = 1
# communication buffer, used for (un)packing data in communication.
BUFFER = 2
# unsafe fields may be accessed in an absolute fashion - the index depends on the data
# and thus may lead to out-of-bounds accesses
CUSTOM = 3
# staggered field
STAGGERED = 4
# staggered field that reverses sign when accessed via opposite direction
STAGGERED_FLUX = 5
@staticmethod
def is_generic(field):
assert isinstance(field, Field)
return field.field_type == FieldType.GENERIC
@staticmethod
def is_indexed(field):
assert isinstance(field, Field)
return field.field_type == FieldType.INDEXED
@staticmethod
def is_buffer(field):
assert isinstance(field, Field)
return field.field_type == FieldType.BUFFER
@staticmethod
def is_custom(field):
assert isinstance(field, Field)
return field.field_type == FieldType.CUSTOM
@staticmethod
def is_staggered(field):
assert isinstance(field, Field)
return field.field_type == FieldType.STAGGERED or field.field_type == FieldType.STAGGERED_FLUX
@staticmethod
def is_staggered_flux(field):
assert isinstance(field, Field)
return field.field_type == FieldType.STAGGERED_FLUX
class Field:
"""
With fields one can formulate stencil-like update rules on structured grids.
This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.
Creating Fields:
The preferred method to create fields is the `fields` function.
Alternatively one can use one of the static functions `Field.create_generic`, `Field.create_from_numpy_array`
and `Field.create_fixed_size`. Don't instantiate the Field directly!
Fields can be created with known or unknown shapes:
1. If you want to create a kernel with fixed loop sizes i.e. the shape of the array is already known.
This is usually the case if just-in-time compilation directly from Python is done.
(see `Field.create_from_numpy_array`)
2. create a more general kernel that works for variable array sizes. This can be used to create kernels
beforehand for a library. (see `Field.create_generic`)
Dimensions and Indexing:
A field has spatial and index dimensions, where the spatial dimensions come first.
The interpretation is that the field has multiple cells in (usually) two or three dimensional space which are
looped over. Additionally N values are stored per cell. In this case spatial_dimensions is two or three,
and index_dimensions equals N. If you want to store a matrix on each point in a two dimensional grid, there
are four dimensions, two spatial and two index dimensions: ``len(arr.shape) == spatial_dims + index_dims``
The shape of the index dimension does not have to be specified. Just use the 'index_dimensions' parameter.
However, it is good practice to define the shape, since out of bounds accesses can be directly detected in this
case. The shape can be passed with the 'index_shape' parameter of the field creation functions.
When accessing (indexing) a field the result is a `Field.Access` which is derived from sympy Symbol.
First specify the spatial offsets in [], then, in case index_dimensions > 0, the indices in ()
e.g. ``f[-1,0,0](7)``
Staggered Fields:
Staggered fields are used to store a value on a second grid shifted by half a cell with respect to the usual
grid.
The first index dimension is used to specify the position on the staggered grid (e.g. 0 means half-way to the
eastern neighbor, 1 is half-way to the northern neighbor, etc.), while additional indices can be used to store
multiple values at each position.
Example using no index dimensions:
>>> a = np.zeros([10, 10])
>>> f = Field.create_from_numpy_array("f", a, index_dimensions=0)
>>> jacobi = (f[-1,0] + f[1,0] + f[0,-1] + f[0,1]) / 4
Examples for index dimensions to create LB field and implement stream pull:
>>> from pystencils import Assignment
>>> stencil = np.array([[0,0], [0,1], [0,-1]])
>>> src, dst = fields("src(3), dst(3) : double[2D]")
>>> assignments = [Assignment(dst[0,0](i), src[-offset](i)) for i, offset in enumerate(stencil)];
"""
@staticmethod
def create_generic(field_name, spatial_dimensions, dtype=np.float64, index_dimensions=0, layout='numpy',
index_shape=None, field_type=FieldType.GENERIC) -> 'Field':
"""
Creates a generic field where the field size is not fixed, i.e. the kernel can be called with arrays of different sizes
Args:
field_name: symbolic name for the field
dtype: numpy data type of the array the kernel is called with later
spatial_dimensions: see documentation of Field
index_dimensions: see documentation of Field
layout: tuple specifying the loop ordering of the spatial dimensions e.g. (2, 1, 0) means that
the outer loop loops over dimension 2, the second outer over dimension 1, and the inner loop
over dimension 0. Also allowed: the strings 'numpy' (0,1,..d) or 'reverse_numpy' (d, ..., 1, 0)
index_shape: optional shape of the index dimensions i.e. maximum values allowed for each index dimension,
has to be a list or tuple
field_type: besides the normal GENERIC fields, there are INDEXED fields that store indices of the domain
that should be iterated over, BUFFER fields that are used to generate communication
packing/unpacking kernels, and STAGGERED fields, which store values half-way to the next
cell
"""
if index_shape is not None:
assert index_dimensions == 0 or index_dimensions == len(index_shape)
index_dimensions = len(index_shape)
if isinstance(layout, str):
layout = spatial_layout_string_to_tuple(layout, dim=spatial_dimensions)
total_dimensions = spatial_dimensions + index_dimensions
if index_shape is None or len(index_shape) == 0:
shape = tuple([FieldShapeSymbol([field_name], i) for i in range(total_dimensions)])
else:
shape = tuple([FieldShapeSymbol([field_name], i) for i in range(spatial_dimensions)] + list(index_shape))
strides = tuple([FieldStrideSymbol(field_name, i) for i in range(total_dimensions)])
np_data_type = np.dtype(dtype)
if np_data_type.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
return Field(field_name, field_type, dtype, layout, shape, strides)
@staticmethod
def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0,
field_type=FieldType.GENERIC) -> 'Field':
"""Creates a field based on the layout, data type, and shape of a given numpy array.
Kernels created for these kinds of fields can only be called with arrays of the same layout, shape and type.
Args:
field_name: symbolic name for the field
array: numpy array
index_dimensions: see documentation of Field
field_type: kind of field
"""
spatial_dimensions = len(array.shape) - index_dimensions
if spatial_dimensions < 1:
raise ValueError("Too many index dimensions. At least one spatial dimension required")
full_layout = get_layout_of_array(array)
spatial_layout = tuple([i for i in full_layout if i < spatial_dimensions])
assert len(spatial_layout) == spatial_dimensions
strides = tuple([s // np.dtype(array.dtype).itemsize for s in array.strides])
shape = tuple(int(s) for s in array.shape)
numpy_dtype = np.dtype(array.dtype)
if numpy_dtype.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
return Field(field_name, field_type, array.dtype, spatial_layout, shape, strides)
@staticmethod
def create_fixed_size(field_name: str, shape: Tuple[int, ...], index_dimensions: int = 0,
dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None,
field_type=FieldType.GENERIC) -> 'Field':
"""
Creates a field with fixed sizes, i.e. the kernel can be called only with arrays of the same size and layout
Args:
field_name: symbolic name for the field
shape: overall shape of the array
index_dimensions: how many of the trailing dimensions are interpreted as index (as opposed to spatial)
dtype: numpy data type of the array the kernel is called with later
layout: full layout of array, not only spatial dimensions
strides: strides in bytes or None to automatically compute them from shape (assuming no padding)
field_type: kind of field
"""
spatial_dimensions = len(shape) - index_dimensions
assert spatial_dimensions >= 1
if isinstance(layout, str):
layout = layout_string_to_tuple(layout, spatial_dimensions + index_dimensions)
shape = tuple(int(s) for s in shape)
if strides is None:
strides = compute_strides(shape, layout)
else:
assert len(strides) == len(shape)
strides = tuple([s // np.dtype(dtype).itemsize for s in strides])
numpy_dtype = np.dtype(dtype)
if numpy_dtype.fields is not None:
if index_dimensions != 0:
raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
shape += (1,)
strides += (1,)
if field_type == FieldType.STAGGERED and index_dimensions == 0:
raise ValueError("A staggered field needs at least one index dimension")
spatial_layout = list(layout)
for i in range(spatial_dimensions, len(layout)):
spatial_layout.remove(i)
return Field(field_name, field_type, dtype, tuple(spatial_layout), shape, strides)
def __init__(self, field_name, field_type, dtype, layout, shape, strides):
"""Do not use directly. Use static create* methods"""
self._field_name = field_name
assert isinstance(field_type, FieldType)
assert len(shape) == len(strides)
self.field_type = field_type
self._dtype = create_type(dtype)
self._layout = normalize_layout(layout)
self.shape = shape
self.strides = strides
self.latex_name: Optional[str] = None
self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
self.coordinate_transform = sp.eye(self.spatial_dimensions)
if field_type == FieldType.STAGGERED:
assert self.staggered_stencil
def new_field_with_different_name(self, new_name):
if self.has_fixed_shape:
return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
else:
return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)
@property
def spatial_dimensions(self) -> int:
return len(self._layout)
@property
def index_dimensions(self) -> int:
return len(self.shape) - len(self._layout)
@property
def ndim(self) -> int:
return len(self.shape)
def values_per_cell(self) -> int:
return functools.reduce(operator.mul, self.index_shape, 1)
@property
def layout(self):
return self._layout
@property
def name(self) -> str:
return self._field_name
@property
def spatial_shape(self) -> Tuple[int, ...]:
return self.shape[:self.spatial_dimensions]
@property
def has_fixed_shape(self):
return is_integer_sequence(self.shape)
@property
def index_shape(self):
return self.shape[self.spatial_dimensions:]
@property
def has_fixed_index_shape(self):
return is_integer_sequence(self.index_shape)
@property
def spatial_strides(self):
return self.strides[:self.spatial_dimensions]
@property
def index_strides(self):
return self.strides[self.spatial_dimensions:]
@property
def dtype(self):
return self._dtype
@property
def itemsize(self):
return self.dtype.numpy_dtype.itemsize
def __repr__(self):
if any(isinstance(s, sp.Symbol) for s in self.spatial_shape):
spatial_shape_str = f'{self.spatial_dimensions}d'
else:
spatial_shape_str = ','.join(str(i) for i in self.spatial_shape)
index_shape_str = ','.join(str(i) for i in self.index_shape)
if self.index_shape:
return f'{self._field_name}({index_shape_str}): {self.dtype}[{spatial_shape_str}]'
else:
return f'{self._field_name}: {self.dtype}[{spatial_shape_str}]'
def __str__(self):
return self.name
def neighbor(self, coord_id, offset):
offset_list = [0] * self.spatial_dimensions
offset_list[coord_id] = offset
return Field.Access(self, tuple(offset_list))
def neighbors(self, stencil):
return [self.__getitem__(s) for s in stencil]
@property
def center_vector(self):
index_shape = self.index_shape
if len(index_shape) == 0:
return sp.Matrix([self.center])
elif len(index_shape) == 1:
return sp.Matrix([self(i) for i in range(index_shape[0])])
elif len(index_shape) == 2:
return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
elif len(index_shape) == 3:
return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
for j in range(index_shape[1])] for i in range(index_shape[0])])
else:
raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")
@property
def center(self):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)
def neighbor_vector(self, offset):
"""Like neighbor, but returns the entire vector/tensor stored at offset."""
if self.spatial_dimensions == 2 and len(offset) == 3:
assert offset[2] == 0
offset = offset[:2]
if self.index_dimensions == 0:
return sp.Matrix([self.__getitem__(offset)])
elif self.index_dimensions == 1:
return sp.Matrix([self.__getitem__(offset)(i) for i in range(self.index_shape[0])])
elif self.index_dimensions == 2:
return sp.Matrix([[self.__getitem__(offset)(i, k) for k in range(self.index_shape[1])]
for i in range(self.index_shape[0])])
else:
raise NotImplementedError("neighbor_vector is not implemented for more than 2 index dimensions")
def __getitem__(self, offset):
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
if type(offset) is not tuple:
offset = (offset,)
if len(offset) != self.spatial_dimensions:
raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
return Field.Access(self, offset)
def absolute_access(self, offset, index):
assert FieldType.is_custom(self)
return Field.Access(self, offset, index, is_absolute_access=True)
def staggered_access(self, offset, index=None):
"""If this field is a staggered field, it can be accessed using half-integer offsets.
For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east
of the cell center, i.e. half-way to the eastern-next cell.
If the field stores more than one value per staggered point (e.g. a vector or a tensor), the index (integer or
tuple of integers) refers to which of these values to access.
"""
assert FieldType.is_staggered(self)
offset_orig = offset
if type(offset) is np.ndarray:
offset = tuple(offset)
if type(offset) is str:
offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
offset = tuple([o * sp.Rational(1, 2) for o in offset])
if len(offset) != self.spatial_dimensions:
raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
prefactor = 1
neighbor_vec = [0] * len(offset)
for i in range(self.spatial_dimensions):
if (offset[i] + sp.Rational(1, 2)).is_Integer:
neighbor_vec[i] = sp.sign(offset[i])
neighbor = offset_to_direction_string(neighbor_vec)
if neighbor not in self.staggered_stencil:
neighbor_vec = inverse_direction(neighbor_vec)
neighbor = offset_to_direction_string(neighbor_vec)
if FieldType.is_staggered_flux(self):
prefactor = -1
if neighbor not in self.staggered_stencil:
raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")
offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))
idx = self.staggered_stencil.index(neighbor)
if self.index_dimensions == 1: # this field stores a scalar value at each staggered position
if index is not None:
raise ValueError("Cannot specify an index for a scalar staggered field")
return prefactor * Field.Access(self, offset, (idx,))
else: # this field stores a vector or tensor at each staggered position
if index is None:
raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
if type(index) is np.ndarray:
index = tuple(index)
if type(index) is not tuple:
index = (index,)
if self.index_dimensions != len(index) + 1:
raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
return prefactor * Field.Access(self, offset, (idx, *index))
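# Example (sketch): for a 2D staggered field storing one value per staggered
# position (D2Q5 stencil with storage directions "W" and "S"), an access via
# the opposite direction resolves to the stored direction of the neighbor cell.
#
#   >>> s = fields("s(2): double[2D]", field_type=FieldType.STAGGERED)
#   >>> s.staggered_access("W")  # stored in this cell
#   s_C^0
#   >>> s.staggered_access("E")  # resolved to the "W" value of the eastern neighbor
#   s_E^0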
def staggered_vector_access(self, offset):
"""Like staggered_access, but returns the entire vector/tensor stored at offset."""
assert FieldType.is_staggered(self)
if self.index_dimensions == 1:
return sp.Matrix([self.staggered_access(offset)])
elif self.index_dimensions == 2:
return sp.Matrix([self.staggered_access(offset, i) for i in range(self.index_shape[1])])
elif self.index_dimensions == 3:
return sp.Matrix([[self.staggered_access(offset, (i, k)) for k in range(self.index_shape[2])]
for i in range(self.index_shape[1])])
else:
raise NotImplementedError("staggered_vector_access is not implemented for more than 3 index dimensions")
@property
def staggered_stencil(self):
assert FieldType.is_staggered(self)
stencils = {
2: {
2: ["W", "S"], # D2Q5
4: ["W", "S", "SW", "NW"] # D2Q9
},
3: {
3: ["W", "S", "B"], # D3Q7
7: ["W", "S", "B", "BSW", "TSW", "BNW", "TNW"], # D3Q15
9: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS"], # D3Q19
13: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS", "BSW", "TSW", "BNW", "TNW"] # D3Q27
}
}
if self.index_shape[0] not in stencils[self.spatial_dimensions]:
raise ValueError(f"No known stencil has {self.index_shape[0]} staggered points")
return stencils[self.spatial_dimensions][self.index_shape[0]]
@property
def staggered_stencil_name(self):
assert FieldType.is_staggered(self)
return f"D{self.spatial_dimensions}Q{self.index_shape[0] * 2 + 1}"
def __call__(self, *args, **kwargs):
center = tuple([0] * self.spatial_dimensions)
return Field.Access(self, center)(*args, **kwargs)
def hashable_contents(self):
return (self._layout,
self.shape,
self.strides,
self.field_type,
self._field_name,
self.latex_name,
self._dtype)
def __hash__(self):
return hash(self.hashable_contents())
def __eq__(self, other):
if not isinstance(other, Field):
return False
return self.hashable_contents() == other.hashable_contents()
@property
def physical_coordinates(self):
if hasattr(self.coordinate_transform, '__call__'):
return self.coordinate_transform(self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
else:
return self.coordinate_transform @ (self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions))
@property
def physical_coordinates_staggered(self):
return self.coordinate_transform @ \
(self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions))
def index_to_physical(self, index_coordinates: sp.Matrix, staggered=False):
if staggered:
index_coordinates = sp.Matrix([0.5] * len(self.coordinate_origin)) + index_coordinates
if hasattr(self.coordinate_transform, '__call__'):
return self.coordinate_transform(self.coordinate_origin + index_coordinates)
else:
return self.coordinate_transform @ (self.coordinate_origin + index_coordinates)
def physical_to_index(self, physical_coordinates: sp.Matrix, staggered=False):
if hasattr(self.coordinate_transform, '__call__'):
if hasattr(self.coordinate_transform, 'inv'):
return self.coordinate_transform.inv()(physical_coordinates) - self.coordinate_origin
else:
idx = sp.Matrix(sp.symbols(f'index_coordinates:{self.ndim}', real=True))
rtn = sp.solve(self.index_to_physical(idx) - physical_coordinates, idx)
assert rtn, f'Could not find inverse of coordinate_transform: {self.index_to_physical(idx)}'
return rtn
else:
rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
if staggered:
rtn = sp.Matrix([i - 0.5 for i in rtn])
return rtn
def set_coordinate_origin_to_field_center(self):
self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])
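# Example (sketch): with the default identity transform and an illustrative
# origin, cell indices map to physical coordinates as origin + index.
#
#   >>> f = fields("f: double[10, 10]")
#   >>> f.coordinate_origin = sp.Matrix([5, 5])
#   >>> f.index_to_physical(sp.Matrix([1, 2]))
#   Matrix([[6], [7]])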
# noinspection PyAttributeOutsideInit,PyUnresolvedReferences
class Access(TypedSymbol):
"""Class representing a relative access into a `Field`.
This class behaves like a normal sympy Symbol; it is actually derived from it. One can build up
sympy expressions using field accesses, solve for them, etc.
Examples:
>>> vector_field_2d = fields("v(2): double[2D]") # create a 2D vector field
>>> northern_neighbor_y_component = vector_field_2d[0, 1](1)
>>> northern_neighbor_y_component
v_N^1
>>> central_y_component = vector_field_2d(1)
>>> central_y_component
v_C^1
>>> central_y_component.get_shifted(1, 0) # move the existing access
v_E^1
>>> central_y_component.at_index(0) # change component
v_C^0
"""
_iterable = False # see https://i10git.cs.fau.de/pycodegen/pystencils/-/merge_requests/166#note_10680
def __new__(cls, name, *args, **kwargs):
obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
return obj
def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False, dtype=None):
field_name = field.name
offsets_and_index = (*offsets, *idx) if idx is not None else offsets
constant_offsets = not any([isinstance(o, sp.Basic) and not o.is_Integer for o in offsets_and_index])
if not idx:
idx = tuple([0] * field.index_dimensions)
if constant_offsets:
offset_name = offset_to_direction_string(offsets)
if field.index_dimensions == 0:
superscript = None
elif field.index_dimensions == 1:
superscript = str(idx[0])
else:
idx_str = ",".join([str(e) for e in idx])
superscript = idx_str
if field.has_fixed_index_shape and not isinstance(field.dtype, StructType):
for i, bound in zip(idx, field.index_shape):
if i >= bound:
raise ValueError("Field index out of bounds")
else:
offset_name = hashlib.md5(pickle.dumps(offsets_and_index)).hexdigest()[:12]
superscript = None
symbol_name = f"{field_name}_{offset_name}"
if superscript is not None:
symbol_name += "^" + superscript
if dtype:
obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
else:
obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
obj._field = field
obj._offsets = []
for o in offsets:
if isinstance(o, sp.Basic):
obj._offsets.append(o)
else:
obj._offsets.append(int(o))
obj._offsets = tuple(sp.sympify(obj._offsets))
obj._offsetName = offset_name
obj._superscript = superscript
obj._index = idx
obj._indirect_addressing_fields = set()
for e in chain(obj._offsets, obj._index):
if isinstance(e, sp.Basic):
obj._indirect_addressing_fields.update(a.field for a in e.atoms(Field.Access))
obj._is_absolute_access = is_absolute_access
return obj
def __getnewargs__(self):
return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype
def __getnewargs_ex__(self):
return (self.field, self.offsets, self.index, self.is_absolute_access, self.dtype), {}
# noinspection SpellCheckingInspection
__xnew__ = staticmethod(__new_stage2__)
# noinspection SpellCheckingInspection
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
def __call__(self, *idx):
if self._index != tuple([0] * self.field.index_dimensions):
raise ValueError("Indexing an already indexed Field.Access")
idx = tuple(idx)
if self.field.index_dimensions == 0 and idx == (0,):
idx = ()
if len(idx) != self.field.index_dimensions:
raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
if len(idx) == 1 and isinstance(idx[0], str):
dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
return Field.Access(self.field, self._offsets, idx,
is_absolute_access=self.is_absolute_access, dtype=dtype)
else:
return Field.Access(self.field, self._offsets, idx,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def __getitem__(self, *idx):
return self.__call__(*idx)
@property
def field(self) -> 'Field':
"""Field that the Access points to"""
return self._field
@property
def offsets(self) -> Tuple:
"""Spatial offset as tuple"""
return self._offsets
@property
def required_ghost_layers(self) -> int:
"""Largest spatial distance that is accessed."""
return int(np.max(np.abs(self._offsets)))
@property
def nr_of_coordinates(self):
return len(self._offsets)
@property
def offset_name(self) -> str:
"""Spatial offset as string, East-West for x, North-South for y and Top-Bottom for z coordinate.
Example:
>>> f = fields("f: double[2D]")
>>> f[1, 1].offset_name # north-east
'NE'
"""
return self._offsetName
@property
def index(self):
"""Value of index coordinates as tuple."""
return self._index
def neighbor(self, coord_id: int, offset: int) -> 'Field.Access':
"""Returns a new Access with changed spatial coordinates.
Args:
coord_id: index of the coordinate to change (0 for x, 1 for y,...)
offset: incremental change of this coordinate
Example:
>>> f = fields('f: [2D]')
>>> f[0,0].neighbor(coord_id=1, offset=-1)
f_S
"""
offset_list = list(self.offsets)
offset_list[coord_id] += offset
return Field.Access(self.field, tuple(offset_list), self.index,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def get_shifted(self, *shift) -> 'Field.Access':
"""Returns a new Access with changed spatial coordinates
Example:
>>> f = fields("f: [2D]")
>>> f[0,0].get_shifted(1, 1)
f_NE
"""
return Field.Access(self.field,
tuple(a + b for a, b in zip(shift, self.offsets)),
self.index,
is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
def at_index(self, *idx_tuple) -> 'Field.Access':
"""Returns new Access with changed index.
Example:
>>> f = fields("f(9): [2D]")
>>> f(0).at_index(8)
f_C^8
"""
return Field.Access(self.field, self.offsets, idx_tuple,
is_absolute_access=self.is_absolute_access, dtype=self.dtype)
def _eval_subs(self, old, new):
return Field.Access(self.field,
tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
tuple(sp.sympify(a).subs(old, new) for a in self.index),
is_absolute_access=self.is_absolute_access,
dtype=self.dtype)
@property
def is_absolute_access(self) -> bool:
"""Indicates if a field access is relative to the loop counters (this is the default) or absolute"""
return self._is_absolute_access
@property
def indirect_addressing_fields(self) -> Set['Field']:
"""Returns a set of fields that the access depends on.
e.g. f[index_field[1, 0]], the outer access to f depends on index_field
"""
return self._indirect_addressing_fields
def _hashable_content(self):
super_class_contents = super(Field.Access, self)._hashable_content()
return (super_class_contents, self._field.hashable_contents(), *self._index,
*self._offsets, self._is_absolute_access)
def _staggered_offset(self, offsets, index):
assert FieldType.is_staggered(self._field)
neighbor = self._field.staggered_stencil[index]
neighbor = direction_string_to_offset(neighbor, self._field.spatial_dimensions)
return [(o + sp.Rational(int(neighbor[i]), 2)) for i, o in enumerate(offsets)]
def _latex(self, _):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
if FieldType.is_staggered(self._field):
offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
for i in range(len(self.offsets))])
if self.is_absolute_access:
offset_str = f"\\mathbf{offset_str}"
elif self.field.spatial_dimensions > 1:
offset_str = f"({offset_str})"
if FieldType.is_staggered(self._field):
if self.index and self.field.index_dimensions > 1:
return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
else:
return f"{{{n}}}_{{{offset_str}}}"
else:
if self.index and self.field.index_dimensions > 0:
return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
else:
return f"{{{n}}}_{{{offset_str}}}"
def __str__(self):
n = self._field.latex_name if self._field.latex_name else self._field.name
offset_str = ",".join([sp.latex(o) for o in self.offsets])
if FieldType.is_staggered(self._field):
offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
for i in range(len(self.offsets))])
if self.is_absolute_access:
offset_str = f"[abs]{offset_str}"
if FieldType.is_staggered(self._field):
if self.index and self.field.index_dimensions > 1:
return f"{n}[{offset_str}]({self.index[1:] if len(self.index) > 2 else self.index[1]})"
else:
return f"{n}[{offset_str}]"
else:
if self.index and self.field.index_dimensions > 0:
return f"{n}[{offset_str}]({self.index if len(self.index) > 1 else self.index[0]})"
else:
return f"{n}[{offset_str}]"
def fields(description=None, index_dimensions=0, layout=None,
field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
"""Creates pystencils fields from a string description.
Examples:
Create a 2D scalar and vector field:
>>> s, v = fields("s, v(2): double[2D]")
>>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
>>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
Create an integer field of shape (10, 20):
>>> f = fields("f : int32[10, 20]")
>>> f.has_fixed_shape, f.shape
(True, (10, 20))
Numpy arrays can be used as template for shape and data type of field:
>>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
>>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
>>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
>>> assert v.index_shape == (2,)
Format string can be left out, field names are taken from keyword arguments.
>>> fields(f1=arr_s, f2=arr_s)
[f1: double[20,20], f2: double[20,20]]
The keyword names ``index_dimensions`` and ``layout`` have special meaning, don't use them for field names
>>> f = fields(f=arr_v, index_dimensions=1)
>>> assert f.index_dimensions == 1
>>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
>>> f.layout
(2, 1, 0)
"""
result = []
if description:
field_descriptions, dtype, shape = _parse_description(description)
layout = 'numpy' if layout is None else layout
for field_name, idx_shape in field_descriptions:
if field_name in kwargs:
arr = kwargs[field_name]
idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
assert idx_shape_of_arr == idx_shape
f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
field_type=field_type)
elif isinstance(shape, tuple):
f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
elif isinstance(shape, int):
f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
elif shape is None:
f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
index_shape=idx_shape, layout=layout, field_type=field_type)
else:
assert False
result.append(f)
else:
assert layout is None, "Layout can not be specified when creating Field from numpy array"
for field_name, arr in kwargs.items():
result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
field_type=field_type))
if len(result) == 0:
raise ValueError("Could not parse field description")
elif len(result) == 1:
return result[0]
else:
return result
def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
coordinates = list(range(len(strides)))
relevant_strides = [stride for i, stride in enumerate(strides) if i not in index_dimension_ids]
result = [x for (y, x) in sorted(zip(relevant_strides, coordinates), key=lambda pair: pair[0], reverse=True)]
return normalize_layout(result)
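# Example (sketch): coordinate 2 has the largest stride, so it varies slowest
# and comes first in the layout tuple.
#
#   >>> get_layout_from_strides((20, 1, 400))
#   (2, 0, 1)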
def get_layout_of_array(arr: np.ndarray, index_dimension_ids: Optional[List[int]] = None):
""" Returns a list indicating the memory layout (linearization order) of the numpy array.
Examples:
>>> get_layout_of_array(np.zeros([3,3,3]))
(0, 1, 2)
In this example the loop over the zeroth coordinate should be the outermost loop,
followed by the first and second. Elements arr[x,y,0] and arr[x,y,1] are adjacent in memory.
Normally constructed numpy arrays have this order; however, arrays with a different memory layout can be
created via stride tricks or by other frameworks.
The index_dimension_ids parameter specifies which coordinates are index (not spatial) dimensions and
should therefore be ignored when determining the layout.
"""
index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
return get_layout_from_strides(arr.strides, index_dimension_ids)
def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0, **kwargs):
"""Creates numpy array with given memory layout.
Args:
shape: shape of the resulting array
layout: layout as tuple, where the coordinates are ordered from slow to fast
alignment: number of bytes to align the beginning and the innermost coordinate to, or False for no alignment
byte_offset: only used when alignment is specified; instead of the array beginning, the address at this
offset is aligned. Mostly used to align the first inner cell rather than the ghost layers
Example:
>>> res = create_numpy_array_with_layout(shape=(2, 3, 4, 5), layout=(3, 2, 0, 1))
>>> res.shape
(2, 3, 4, 5)
>>> get_layout_of_array(res)
(3, 2, 0, 1)
"""
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
cur_layout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if cur_layout[i] != layout[i]:
index_to_swap_with = cur_layout.index(layout[i])
swaps.append((i, index_to_swap_with))
cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
assert tuple(cur_layout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
if not alignment:
res = np.empty(shape, order='c', **kwargs)
else:
res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
if dim > 3:
raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
return tuple(reversed(range(dim)))
if layout_str in ('f', 'reverse_numpy'):
return tuple(reversed(range(dim)))
elif layout_str in ('c', 'numpy'):
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layout_str)
def layout_string_to_tuple(layout_str, dim):
if dim <= 0:
raise ValueError("Dimensionality must be positive")
layout_str = layout_str.lower()
if layout_str == 'fzyx' or layout_str == 'soa':
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim)))
elif layout_str == 'zyxf' or layout_str == 'aos':
if dim > 4:
raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
return tuple(reversed(range(dim - 1))) + (dim - 1,)
elif layout_str == 'f' or layout_str == 'reverse_numpy':
return tuple(reversed(range(dim)))
elif layout_str == 'c' or layout_str == 'numpy':
return tuple(range(dim))
raise ValueError("Unknown layout descriptor " + layout_str)
def normalize_layout(layout):
"""Takes a layout tuple and subtracts the minimum from all entries"""
min_entry = min(layout)
return tuple(i - min_entry for i in layout)
def compute_strides(shape, layout):
"""
Computes strides assuming no padding exists
Args:
shape: shape (size) of array
layout: layout specification as tuple
Returns:
strides in elements, not in bytes
"""
dim = len(shape)
assert len(layout) == dim
assert len(set(layout)) == dim
strides = [0] * dim
product = 1
for j in reversed(layout):
strides[j] = product
product *= shape[j]
return tuple(strides)
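# Example (sketch): for shape (2, 3, 4) in C order, i.e. layout (0, 1, 2),
# the element strides (not byte strides) are:
#
#   >>> compute_strides((2, 3, 4), (0, 1, 2))
#   (12, 4, 1)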
# ---------------------------------------- Parsing of string in fields() function --------------------------------------
field_description_regex = re.compile(r"""
\s* # ignore leading white spaces
(\w+) # identifier is a sequence of alphanumeric characters, is stored in first group
(?: # optional index specification e.g. (1, 4, 2)
\s*
\(
([^\)]+) # read everything up to closing bracket
\)
\s*
)?
\s*,?\s* # ignore trailing white spaces and comma
""", re.VERBOSE)
type_description_regex = re.compile(r"""
\s*
(\w+)? # optional dtype
\s*
\[
([^\]]+)
\]
\s*
""", re.VERBOSE | re.IGNORECASE)
def _parse_part1(d):
result = field_description_regex.match(d)
while result:
name, index_str = result.group(1), result.group(2)
index = tuple(int(e) for e in index_str.split(",")) if index_str else ()
yield name, index
d = d[result.end():]
result = field_description_regex.match(d)
def _parse_description(description):
def parse_part2(d):
result = type_description_regex.match(d)
if result:
data_type_str, size_info = result.group(1), result.group(2).strip().lower()
if data_type_str is None:
data_type_str = 'float64'
data_type_str = data_type_str.lower().strip()
if not data_type_str:
data_type_str = 'float64'
if size_info.endswith('d'):
size_info = int(size_info[:-1])
else:
size_info = tuple(int(e) for e in size_info.split(","))
return data_type_str, size_info
else:
raise ValueError("Could not parse field description")
if ':' in description:
field_description, field_info = description.split(':')
else:
field_description, field_info = description, 'float64[2D]'
fields_info = [e for e in _parse_part1(field_description)]
if not field_info:
raise ValueError("Could not parse field description")
data_type, size = parse_part2(field_info)
return fields_info, data_type, size
import sympy as sp
from pystencils.typing import PointerType
class DivFunc(sp.Function):
"""
DivFunc represents a division operation, since sympy represents divisions with ^-1
"""
is_Atom = True
is_real = True
def __new__(cls, *args, **kwargs):
if len(args) != 2:
raise ValueError(f'{cls} takes exactly 2 arguments, but {len(args)} were given!')
divisor, dividend, *other_args = args
return sp.Function.__new__(cls, divisor, dividend, *other_args, **kwargs)
def _eval_evalf(self, *args, **kwargs):
return self.divisor.evalf() / self.dividend.evalf()
@property
def divisor(self):
return self.args[0]
@property
def dividend(self):
return self.args[1]
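# Example (sketch): DivFunc keeps the division explicit instead of letting
# sympy rewrite it as a multiplication by a negative power. Note that
# `divisor` names args[0] (the numerator) and `dividend` names args[1] here.
#
#   >>> x, y = sp.symbols("x y")
#   >>> d = DivFunc(x, y)
#   >>> d.divisor, d.dividend
#   (x, y)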
class AddressOf(sp.Function):
"""
AddressOf is the '&' operation in C. It gets the address of a lvalue.
"""
is_Atom = True
def __new__(cls, arg):
obj = sp.Function.__new__(cls, arg)
return obj
@property
def canonical(self):
if hasattr(self.args[0], 'canonical'):
return self.args[0].canonical
else:
raise NotImplementedError()
@property
def is_commutative(self):
return self.args[0].is_commutative
@property
def dtype(self):
if hasattr(self.args[0], 'dtype'):
return PointerType(self.args[0].dtype, restrict=True)
else:
raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
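# Example (sketch): taking the address of a typed symbol yields a
# restrict-qualified pointer to that symbol's type.
#
#   >>> from pystencils.typing import TypedSymbol, create_type
#   >>> ptr = AddressOf(TypedSymbol("x", create_type("float64")))
#   >>> isinstance(ptr.dtype, PointerType)
#   True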
from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
from pystencils.gpu.gpujit import make_python_function
from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
try:
import cupy as cp
import cupyx as cpx
except ImportError:
cp = None
cpx = None
import numpy as np
class GPUArrayHandler:
def __init__(self, device_number):
self._device_number = device_number
def zeros(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.zeros(shape=shape, dtype=dtype, order=order)
def ones(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.ones(shape=shape, dtype=dtype, order=order)
def empty(self, shape, dtype=np.float64, order='C'):
with cp.cuda.Device(self._device_number):
return cp.empty(shape=shape, dtype=dtype, order=order)
def to_gpu(self, numpy_array):
swaps = _get_index_swaps(numpy_array)
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
gpu_array = cp.asarray(numpy_array.base)
for a, b in reversed(swaps):
gpu_array = gpu_array.swapaxes(a, b)
return gpu_array
else:
return cp.asarray(numpy_array)
def upload(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
array.base.set(numpy_array.base)
else:
with cp.cuda.Device(self._device_number):
array.set(numpy_array)
def download(self, array, numpy_array):
assert self._device_number == array.device.id
if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
with cp.cuda.Device(self._device_number):
numpy_array.base[:] = array.base.get()
else:
with cp.cuda.Device(self._device_number):
numpy_array[:] = array.get()
def randn(self, shape, dtype=np.float64):
with cp.cuda.Device(self._device_number):
return cp.random.randn(*shape, dtype=dtype)
@staticmethod
def pinned_numpy_array(layout, shape, dtype):
assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
cur_layout = list(range(len(shape)))
swaps = []
for i in range(len(layout)):
if cur_layout[i] != layout[i]:
index_to_swap_with = cur_layout.index(layout[i])
swaps.append((i, index_to_swap_with))
cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
assert tuple(cur_layout) == tuple(layout)
shape = list(shape)
for a, b in swaps:
shape[a], shape[b] = shape[b], shape[a]
res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
for a, b in reversed(swaps):
res = res.swapaxes(a, b)
return res
from_numpy = to_gpu
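# Example (a sketch; requires a working cupy installation with at least one
# visible GPU device):
#
#   >>> handler = GPUArrayHandler(device_number=0)
#   >>> gpu_arr = handler.zeros((16, 16))
#   >>> host = np.ones((16, 16))
#   >>> handler.upload(gpu_arr, host)    # host -> device
#   >>> handler.download(gpu_arr, host)  # device -> host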
class GPUNotAvailableHandler:
def __getattribute__(self, name):
raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
def _get_index_swaps(array):
swaps = []
if array.base is not None and isinstance(array.base, np.ndarray):
for stride in array.base.strides:
index_base = array.base.strides.index(stride)
index_view = array.strides.index(stride)
if index_base != index_view and (index_view, index_base) not in swaps:
swaps.append((index_base, index_view))
return swaps
import numpy as np
from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda
from pystencils.typing import StructType
from pystencils.field import FieldType
from pystencils.include import get_pystencils_include_path
from pystencils.kernel_wrapper import KernelWrapper
from pystencils.typing import BasicType, FieldPointerSymbol
USE_FAST_MATH = True
def get_cubic_interpolation_include_paths():
from os.path import join, dirname
return [join(dirname(__file__), "CubicInterpolationCUDA", "code"),
join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
"""
Creates a kernel function from an abstract syntax tree which
was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
or :func:`pystencils.gpu.created_indexed_cuda_kernel`
Args:
kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor.
custom_backend: use own custom printer for code generation
Returns:
compiled kernel as Python function
"""
import cupy as cp
if argument_dict is None:
argument_dict = {}
headers = get_headers(kernel_function_node)
if cp.cuda.runtime.is_hip:
headers.add('"gpu_defines.h"')
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<hip/hip_fp16.h>')
else:
headers.update({'"gpu_defines.h"', '<cstdint>'})
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<cuda_fp16.h>')
header_list = sorted(headers)
includes = "\n".join([f"#include {include_file}" for include_file in header_list])
code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n"
code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
options = ["-w", "-std=c++11"]
if USE_FAST_MATH:
options.append("-use_fast_math")
options.append("-I" + get_pystencils_include_path())
func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
parameters = kernel_function_node.get_parameters()
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args, block_and_thread_numbers = cache[key]
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
shape = _check_arguments(parameters, full_arguments)
indexing = kernel_function_node.indexing
block_and_thread_numbers = indexing.call_parameters(shape)
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
args = tuple(_build_numpy_argument_list(parameters, full_arguments))
cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique
device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node
parameters = kernel_function_node.get_parameters()
wrapper = KernelWrapper(wrapper, parameters, ast)
wrapper.num_regs = func.num_regs
return wrapper
def _build_numpy_argument_list(parameters, argument_dict):
argument_dict = {k: v for k, v in argument_dict.items()}
result = []
for param in parameters:
if param.is_field_pointer:
array = argument_dict[param.field_name]
actual_type = array.dtype
expected_type = param.fields[0].dtype.numpy_dtype
if expected_type != actual_type:
raise ValueError(f"Data type mismatch for field {param.field_name}. "
f"Expected {expected_type} got {actual_type}.")
result.append(array)
elif param.is_field_stride:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type
array = argument_dict[param.field_name]
stride = cast_to_dtype(array.strides[param.symbol.coordinate] // array.dtype.itemsize)
result.append(stride)
elif param.is_field_shape:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type
array = argument_dict[param.field_name]
result.append(cast_to_dtype(array.shape[param.symbol.coordinate]))
else:
expected_type = param.symbol.dtype.numpy_dtype
result.append(expected_type.type(argument_dict[param.symbol.name]))
assert len(result) == len(parameters)
return result
def _check_arguments(parameter_specification, argument_dict):
"""
Checks if parameters passed to kernel match the description in the AST function node.
If they do not match, a ValueError is raised; on success, the array shape that determines the CUDA blocks and threads is returned
"""
argument_dict = {k: v for k, v in argument_dict.items()}
array_shapes = set()
index_arr_shapes = set()
for param in parameter_specification:
if isinstance(param.symbol, FieldPointerSymbol):
symbolic_field = param.fields[0]
try:
field_arr = argument_dict[symbolic_field.name]
except KeyError:
raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape:
raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
f"which does not match expected shape {str(symbolic_field.shape)}")
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
f"which does not match expected strides {str(symbolic_field_strides)}")
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
if len(array_shapes) > 1:
raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 1:
raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 0:
return list(index_arr_shapes)[0]
else:
return list(array_shapes)[0]
import abc
from functools import partial
import math
from typing import List, Tuple
import sympy as sp
from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.sympyextensions import is_integer_sequence, prod
class ThreadIndexingSymbol(TypedSymbol):
def __new__(cls, *args, **kwds):
obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, *args, **kwargs):
obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
"""
Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
to the GPU's block and grid system. It calculates indices based on the GPU's thread and block indices
and computes the number of blocks and threads a kernel is started with.
The Indexing class is created with an iteration space that is given as a list of slices determining start, stop
and step size for each coordinate. Furthermore, the data_layout is given as a tuple to determine the fast and slow
coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"not tuple of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property
@abc.abstractmethod
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """
@property
def index_variables(self):
"""Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod
def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call.
Args:
arr_shape: the numeric (not symbolic) shape of the array
Returns:
dict with keys 'block' and 'grid', holding tuples with the number of (x,y,z) threads per block and
blocks on the grid that the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernel_content, arr_shape):
"""In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
Args:
kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
arr_shape: the numeric or symbolic shape of the field
Returns:
ast node, which is put inside the kernel function
"""
@abc.abstractmethod
def max_threads_per_block(self):
"""Return maximal number of threads per block for launch bounds. If this cannot be determined without
knowing the array shape return None for unknown """
@abc.abstractmethod
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
# -------------------------------------------- Implementations ---------------------------------------------------------
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple specifying loop order with innermost loop last.
This is the same format as returned by `Field.layout`.
permute_block_size_dependent_on_layout: if True, the block_size is permuted such that the fastest coordinate
gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties
device_number: device number of the used GPU. By default, the zeroth device is used.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64), device_number=None):
super(BlockIndexing, self).__init__(iteration_space, data_layout)
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size
if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
# Get device limits
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
else:
da = cp.cuda.Device(device_number).attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size
self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property
def cuda_indices(self):
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
@property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self.iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
return coordinates
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs
if not self._compile_time_block_size:
assert len(block_size) == 3
adapted_block_size = []
for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
extend_gr = (1,) * (3 - len(grid))
return {'block': block_size,
'grid': grid + extend_gr}
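# Worked example (sketch): for a 2D space with widths (64, 64) and permuted block size
# (2, 128, 1), the adaptation loop shrinks the block to (2, 64, 1) -- 128 threads would
# exceed the width of 64 -- and the grid becomes
# (div_ceil(64, 2), div_ceil(64, 64), 1) == (32, 1, 1).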
def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim]
if len(self._iteration_space) - 1 == len(arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)])
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a cupy function.
Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
"""
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True:
required_registers = math.prod(result) * required_registers_per_thread
if required_registers <= max_registers_per_block:
return result
else:
largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert result[largest_list_entry_idx] >= 2
result[largest_list_entry_idx] //= 2
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
"""Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
if not is_integer_sequence(block_size):
return block_size
sorted_block_size = list(sorted(block_size, reverse=True))
while len(sorted_block_size) > len(layout):
sorted_block_size[0] *= sorted_block_size[-1]
sorted_block_size = sorted_block_size[:-1]
result = list(block_size)
for l, bs in zip(reversed(layout), sorted_block_size):
result[l] = bs
return tuple(result[:len(layout)])
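# Worked example (sketch): block_size=(128, 2, 1) with the 2D layout (0, 1).
# The three entries are first folded down to two ([128, 2]); the fastest coordinate
# (layout[-1] == 1) then gets the largest dimension, yielding (2, 128).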
def max_threads_per_block(self):
if is_integer_sequence(self._block_size):
return prod(self._block_size)
else:
return None
def symbolic_parameters(self):
return set(b for b in self._block_size if isinstance(b, sp.Symbol))
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line', i.e. the elements adjacent in memory, to a 1D GPU block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}.
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger
than the maximum number of threads allowed in a GPU block (which depends on the device).
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple to determine the fast and slow coordinates.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
super(LineIndexing, self).__init__(iteration_space, data_layout)
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
return coordinates
@property
def coordinates(self):
return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self.cuda_indices:
return 1
else:
idx = self.cuda_indices.index(cuda_idx)
return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
def guard(self, kernel_content, arr_shape):
return kernel_content
def max_threads_per_block(self):
return None
def symbolic_parameters(self):
return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
# -------------------------------------- Helper functions --------------------------------------------------------------
def _get_numeric_iteration_slice(iteration_slice, arr_shape):
res = []
for slice_component, shape in zip(iteration_slice, arr_shape):
result_slice = slice_component
if not isinstance(result_slice.start, int):
start = result_slice.start
assert len(start.free_symbols) == 1
start = start.subs({symbol: shape for symbol in start.free_symbols})
result_slice = slice(start, result_slice.stop, result_slice.step)
if not isinstance(result_slice.stop, int):
stop = result_slice.stop
assert len(stop.free_symbols) == 1
stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
result_slice = slice(result_slice.start, stop, result_slice.step)
assert isinstance(result_slice.step, int)
res.append(result_slice)
return res
def _get_widths(iteration_slice):
widths = []
for iter_slice in iteration_slice:
step = iter_slice.step
assert isinstance(step, int), f"Step can only be of type int, not of type {type(step)}"
start = iter_slice.start
stop = iter_slice.stop
if step == 1:
if stop - start == 0:
widths.append(1)
else:
widths.append(stop - start)
else:
width = (stop - start) / step
if isinstance(width, int):
widths.append(width)
elif isinstance(width, float):
widths.append(math.ceil(width))
else:
widths.append(div_ceil(stop - start, step))
return widths
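# Example (sketch): for slice(0, 10, 3) all bounds are plain ints, so the width is
# math.ceil((10 - 0) / 3) == 4 -- the iteration visits 0, 3, 6 and 9.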
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
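# Example (sketch): for iter_slice == slice(2, 10, 2) the raw coordinate of thread t is
# t + 2 (see the `coordinates` properties), and offset == step * start - start == 2, so the
# loop counter becomes (t + 2) * 2 - 2, i.e. 2, 4, 6, ... for t = 0, 1, 2, ...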
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
if isinstance(gpu_indexing, str):
if gpu_indexing == 'block':
indexing_creator = BlockIndexing
elif gpu_indexing == 'line':
indexing_creator = LineIndexing
else:
raise ValueError(f"Unknown GPU indexing {gpu_indexing}. Valid values are 'block' and 'line'")
if gpu_indexing_params:
indexing_creator = partial(indexing_creator, **gpu_indexing_params)
return indexing_creator
else:
return gpu_indexing
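# Usage sketch (hypothetical parameters):
#
#     creator = indexing_creator_from_params('block', {'block_size': (64, 4, 1)})
#     indexing = creator(iteration_space=space, data_layout=layout)
#
# Anything that is not a string (e.g. an indexing class or a partial) is returned unchanged.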
import sympy as sp
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend
from pystencils.gpu.gpujit import make_python_function
from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.slicing import normalize_slice
from pystencils.transformations import (
get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
function_name = config.function_name
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice = config.iteration_slice
ghost_layers = config.ghost_layers
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written])
buffers = set([f for f in all_fields if FieldType.is_buffer(f)])
fields_without_buffers = all_fields - buffers
field_accesses = set()
num_buffer_accesses = 0
indexed_elements = set()
for eq in assignments:
indexed_elements.update(eq.atoms(sp.Indexed))
field_accesses.update(eq.atoms(Field.Access))
field_accesses = {e for e in field_accesses if not e.is_absolute_access}
num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
# common shape and field from which the iteration space is formed
common_field = get_common_field(fields_without_buffers)
common_shape = common_field.spatial_shape
if iteration_slice is None:
# determine iteration slice from ghost layers
if ghost_layers is None:
# determine required number of ghost layers from field access
required_ghost_layers = max([fa.required_ghost_layers for fa in field_accesses])
ghost_layers = [(required_ghost_layers, required_ghost_layers)] * len(common_shape)
iteration_slice = []
if isinstance(ghost_layers, int):
for i in range(len(common_shape)):
iteration_slice.append(slice(ghost_layers, -ghost_layers if ghost_layers > 0 else None))
ghost_layers = [(ghost_layers, ghost_layers)] * len(common_shape)
else:
for i in range(len(common_shape)):
iteration_slice.append(slice(ghost_layers[i][0],
-ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
iteration_space = normalize_slice(iteration_slice, common_shape)
else:
iteration_space = normalize_slice(iteration_slice, common_shape)
iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
if len(indexed_elements) > 0:
common_indexed_element = get_common_indexed_element(indexed_elements)
index = common_indexed_element.indices[0].atoms(TypedSymbol)
assert len(index) == 1, "index expressions must only contain one symbol representing the index"
indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
data_layout=common_field.layout)
extended_ctrs = [index.pop(), *loop_counter_symbols]
loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
else:
indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
assignments = loop_counter_assignments + assignments
block = indexing.guard(Block(assignments), common_shape)
unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
ast = KernelFunction(block,
Target.GPU,
Backend.CUDA,
make_python_function,
ghost_layers,
function_name,
assignments=assignments)
ast.global_variables.update(indexing.index_variables)
base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = []
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions)
for f in all_fields}
coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
if any(FieldType.is_buffer(f) for f in all_fields):
base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
field_to_fixed_coordinates=coord_mapping)
# If loop counter symbols have been explicitly used in the update equations (e.g. for built-in periodicity),
# they are defined here
undefined_loop_counters = {LoopOverCoordinate.is_loop_counter_symbol(s): s for s in ast.body.undefined_symbols
if LoopOverCoordinate.is_loop_counter_symbol(s) is not None}
for i, loop_counter in undefined_loop_counters.items():
ast.body.insert_front(SympyAssignment(loop_counter, indexing.coordinates[i]))
# attach the indexing object, which determines #blocks and #threads, as an additional member
# of the KernelFunction node; this is used by the jit
ast.indexing = indexing
return ast
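# Usage sketch: this function is normally reached through pystencils.create_kernel with a
# GPU target rather than called directly; a minimal route (hypothetical fields src, dst):
#
#     config = CreateKernelConfig(target=Target.GPU, gpu_indexing='block')
#     ast = create_kernel([Assignment(dst.center, 2 * src.center)], config=config)
#     kernel = ast.compile()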
def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
index_fields = config.index_fields
function_name = config.function_name
coordinate_names = config.coordinate_names
indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
fields_written = assignments.bound_fields
fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written])
# extract the index fields based on the name. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
f"Non-index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields:
index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
def get_coordinate_symbol_assignment(name):
for ind_f in index_fields:
assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
data_type = ind_f.dtype
if data_type.has_element(name):
rhs = ind_f[0](name)
lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs)
raise ValueError(f"Index {name} not found in any of the passed index fields")
coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
for n in coordinate_names[:spatial_coordinates]]
coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
idx_field = list(index_fields)[0]
iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
idx_field.spatial_shape)
indexing = indexing_creator(iteration_space=iteration_space,
data_layout=idx_field.layout)
function_body = Block(coordinate_symbol_assignments + assignments)
function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
None, function_name, assignments=assignments)
ast.global_variables.update(indexing.index_variables)
coord_mapping = indexing.coordinates
base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions)
for f in all_fields}
coord_mapping = {f.name: coord_mapping for f in index_fields}
coord_mapping.update({f.name: coordinate_typed_symbols for f in non_index_fields})
resolve_field_accesses(ast, read_only_fields, field_to_fixed_coordinates=coord_mapping,
field_to_base_pointer_info=base_pointer_info)
# attach the indexing object, which determines #blocks and #threads, as an additional member
# of the KernelFunction node; this is used by the jit
ast.indexing = indexing
return ast
import numpy as np
from itertools import product
from pystencils import CreateKernelConfig, create_kernel
from pystencils.gpu import make_python_function
from pystencils import Assignment, Field
from pystencils.enums import Target
from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
"""Copies a rectangular part of an array to another non-overlapping part"""
f = Field.create_generic("pdfs", len(domain_size), index_dimensions=index_dimensions, dtype=dtype)
normalized_from_slice = normalize_slice(from_slice, f.spatial_shape)
normalized_to_slice = normalize_slice(to_slice, f.spatial_shape)
offset = [s1.start - s2.start for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
"Slices have to have same size"
update_eqs = []
if index_dimensions < 2:
index_dim_shape = [index_dim_shape]
for i in product(*[range(d) for d in index_dim_shape]):
eq = Assignment(f(*i), f[tuple(offset)](*i))
update_eqs.append(eq)
config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
ast = create_kernel(update_eqs, config=config)
return ast
def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
thickness=None, dtype=np.float64, target=Target.GPU):
assert target in {Target.GPU}
src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
kernels = []
for src_slice, dst_slice in src_dst_slice_tuples:
ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
kernels.append(make_python_function(ast))
def functor(pdfs, **_):
for kernel in kernels:
kernel(pdfs=pdfs)
return functor
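# Usage sketch (hypothetical names): given a stencil and a GPU array `gpu_pdfs` whose
# shape matches domain_size plus ghost layers, one periodicity application would be
#
#     periodicity = get_periodic_boundary_functor(stencil, domain_size=(64, 64))
#     periodicity(pdfs=gpu_pdfs)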
from os.path import dirname, realpath
def get_pystencils_include_path():
return dirname(realpath(__file__))
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#include <array>
#include <map>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#include "myintrin.h"
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
template <typename T, std::size_t Alignment>
class AlignedAllocator
{
public:
typedef T value_type;
template <typename U>
struct rebind {
typedef AlignedAllocator<U, Alignment> other;
};
T * allocate(const std::size_t n) const {
if (n == 0) {
return nullptr;
}
void * const p = _mm_malloc(n*sizeof(T), Alignment);
if (p == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(p);
}
void deallocate(T * const p, const std::size_t n) const {
_mm_free(p);
}
};
template <typename Key, typename T>
using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
__m128i temp3;
temp2 = _mm_shuffle_epi32(temp2 ,0xff);
temp3 = _mm_slli_si128(temp1, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp1 = _mm_xor_si128(temp1, temp2);
return temp1;
}
QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
std::array<__m128i,11> rk;
__m128i tmp;
rk[0] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1);
k = aesni_keygen_assist(k, tmp);
rk[1] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x2);
k = aesni_keygen_assist(k, tmp);
rk[2] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x4);
k = aesni_keygen_assist(k, tmp);
rk[3] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x8);
k = aesni_keygen_assist(k, tmp);
rk[4] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x10);
k = aesni_keygen_assist(k, tmp);
rk[5] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x20);
k = aesni_keygen_assist(k, tmp);
rk[6] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x40);
k = aesni_keygen_assist(k, tmp);
rk[7] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x80);
k = aesni_keygen_assist(k, tmp);
rk[8] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1b);
k = aesni_keygen_assist(k, tmp);
rk[9] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x36);
k = aesni_keygen_assist(k, tmp);
rk[10] = k;
return rk;
}
QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
alignas(16) std::array<uint32,4> a;
_mm_store_si128((__m128i*) a.data(), k128);
static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk = aesni_keygen(k128);
roundkeys[a] = rk;
}
return roundkeys[a];
}
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k0) {
auto k = aesni_roundkeys(k0);
__m128i x = _mm_xor_si128(k[0], in);
x = _mm_aesenc_si128(x, k[1]);
x = _mm_aesenc_si128(x, k[2]);
x = _mm_aesenc_si128(x, k[3]);
x = _mm_aesenc_si128(x, k[4]);
x = _mm_aesenc_si128(x, k[5]);
x = _mm_aesenc_si128(x, k[6]);
x = _mm_aesenc_si128(x, k[7]);
x = _mm_aesenc_si128(x, k[8]);
x = _mm_aesenc_si128(x, k[9]);
x = _mm_aesenclast_si128(x, k[10]);
return x;
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
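// Usage sketch (illustrative values, added comment): the functions above realize a
// counter-based RNG -- the same (counter, key) tuple always produces the same output,
// so no generator state has to be carried between calls:
//
//     double r1, r2;
//     aesni_double2(x, y, z, time_step, 0u, 0u, 0u, 42u, r1, r2);
//     // r1 and r2 are uniformly distributed in (0, 1)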
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
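// Explanatory note (added): the helper above assembles one 53-bit uniform from two
// 32-bit words: z = x ^ (y << 21) fills bits 0..52, and the multiply-add maps the
// integer z to the open interval (0, 1) via z * 2^-53 + 2^-54.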
QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __AVX2__
QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
alignas(32) std::array<uint32,8> a;
_mm256_store_si256((__m256i*) a.data(), k256);
static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
auto rk2 = aesni_keygen(_mm256_extractf128_si256(k256, 1));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my256_set_m128i(rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m256i aesni1xm128i(const __m256i & in, const __m256i & k0) {
#if defined(__VAES__)
auto k = aesni_roundkeys(k0);
__m256i x = _mm256_xor_si256(k[0], in);
x = _mm256_aesenc_epi128(x, k[1]);
x = _mm256_aesenc_epi128(x, k[2]);
x = _mm256_aesenc_epi128(x, k[3]);
x = _mm256_aesenc_epi128(x, k[4]);
x = _mm256_aesenc_epi128(x, k[5]);
x = _mm256_aesenc_epi128(x, k[6]);
x = _mm256_aesenc_epi128(x, k[7]);
x = _mm256_aesenc_epi128(x, k[8]);
x = _mm256_aesenc_epi128(x, k[9]);
x = _mm256_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm256_extractf128_si256(in, 0), _mm256_extractf128_si256(k0, 0));
__m128i b = aesni1xm128i(_mm256_extractf128_si256(in, 1), _mm256_extractf128_si256(k0, 1));
__m256i x = _my256_set_m128i(b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
alignas(64) std::array<uint32,16> a;
_mm512_store_si512((__m512i*) a.data(), k512);
static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
auto rk2 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 1));
auto rk3 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 2));
auto rk4 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 3));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my512_set_m128i(rk4[i], rk3[i], rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
#ifdef __VAES__
auto k = aesni_roundkeys(k0);
__m512i x = _mm512_xor_si512(k[0], in);
x = _mm512_aesenc_epi128(x, k[1]);
x = _mm512_aesenc_epi128(x, k[2]);
x = _mm512_aesenc_epi128(x, k[3]);
x = _mm512_aesenc_epi128(x, k[4]);
x = _mm512_aesenc_epi128(x, k[5]);
x = _mm512_aesenc_epi128(x, k[6]);
x = _mm512_aesenc_epi128(x, k[7]);
x = _mm512_aesenc_epi128(x, k[8]);
x = _mm512_aesenc_epi128(x, k[9]);
x = _mm512_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
__m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
__m128i c = aesni1xm128i(_mm512_extracti32x4_epi32(in, 2), _mm512_extracti32x4_epi32(k0, 2));
__m128i d = aesni1xm128i(_mm512_extracti32x4_epi32(in, 3), _mm512_extracti32x4_epi32(k0, 3));
__m512i x = _my512_set_m128i(d, c, b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
}
QUALIFIERS void aesni_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
/*
Copyright 2021-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_MSC_VER)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#include <arm_sve.h>
typedef svbool_t svbool_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint32_t svint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#endif
#ifdef __ARM_NEON
inline float32x4_t makeVec_f32(float a, float b, float c, float d)
{
alignas(16) float data[4] = {a, b, c, d};
return vld1q_f32(data);
}
inline float64x2_t makeVec_f64(double a, double b)
{
alignas(16) double data[2] = {a, b};
return vld1q_f64(data);
}
inline int32x4_t makeVec_s32(int a, int b, int c, int d)
{
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
#endif
inline void cachelineZero(void * p) {
#if !defined(_MSC_VER) || defined(__clang__)
__asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
#if !defined(_MSC_VER) || defined(__clang__)
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
#endif
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
/// Only bandwidth-bound code may therefore benefit. None of this is guaranteed, and it may change in the future.
/// Clang version must be 15 or higher for x86 half precision support.
/// GCC version must be 12 or higher for x86 half precision support.
/// Support also seems to require SSE, so ensure that the respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
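// Explanatory note (added): without AVX-512 there is no unsigned 32-bit to float
// conversion, so the value is split as v == 2 * (v >> 1) + (v & 1); both parts fit
// into a signed conversion and are recombined as (v2f + v2f) + v1f.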
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi32(R0, R1);
T1 = _mm_unpacklo_epi32(R2, R3);
T2 = _mm_unpackhi_epi32(R0, R1);
T3 = _mm_unpackhi_epi32(R2, R3);
R0 = _mm_unpacklo_epi64(T0, T1);
R1 = _mm_unpackhi_epi64(T0, T1);
R2 = _mm_unpacklo_epi64(T2, T3);
R3 = _mm_unpackhi_epi64(T2, T3);
}
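// Explanatory note (added): this is the classic 4x4 transpose of 32-bit lanes. For rows
// R0 = (a0 a1 a2 a3) ... R3 = (d0 d1 d2 d3), the unpacklo/unpackhi pairs build the
// columns R0 = (a0 b0 c0 d0), R1 = (a1 b1 c1 d1), R2 = (a2 b2 c2 d2), R3 = (a3 b3 c3 d3).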
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
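// Explanatory note (added): the generic fallback is the well-known magic-number
// uint64 -> double conversion: the high 32 bits are embedded into a double with
// exponent offset 2^84, the low 32 bits into one with offset 2^52; subtracting
// (2^84 + 2^52) removes both offsets and the final add reassembles hi * 2^32 + lo.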
#endif
#ifdef __AVX2__
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128i(hi, lo);
#else
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128d(hi, lo);
#else
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v);
#else
__m256i v2 = _mm256_srli_epi32(v, 1);
__m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
__m256 v2f = _mm256_cvtepi32_ps(v2);
__m256 v1f = _mm256_cvtepi32_ps(v1);
return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
#else
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
}
QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif