5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
 import warnings
 from typing import Container, Union
+import numpy as np
 import sympy as sp
+from sympy.logic.boolalg import BooleanFunction, BooleanAtom
 import pystencils.astnodes as ast
-from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
-from pystencils.data_types import (
+from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
-    PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression,
+                               get_type_of_expression, VectorMemoryAccess)
-    vector_memory_access)
+from pystencils.functions import DivFunc
-from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
 from pystencils.field import Field
 from pystencils.integer_functions import modulo_ceil, modulo_floor
 from pystencils.sympyextensions import fast_subs
-from pystencils.transformations import (
+from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one
-    cut_loop, filtered_tree_iteration, replace_inner_stride_with_one)
 # noinspection PyPep8Naming
 class vec_any(sp.Function):
-    nargs = (1, )
+    nargs = (1,)
 # noinspection PyPep8Naming
 class vec_all(sp.Function):
-    nargs = (1, )
+    nargs = (1,)
-def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
+class NontemporalFence(ast.Node):
+    """Child-less AST node used as a marker for a fence belonging to non-temporal memory accesses.
+    The node neither defines nor reads any symbols and has no arguments. All instances compare equal, which lets
+    callers insert it with ``if_not_exists=True`` without creating duplicates.
+    NOTE(review): what the backend prints for this node is not visible here - confirm against the C backend.
+    """
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+    @property
+    def symbols_defined(self):
+        """Symbols introduced by this node: none."""
+        return set()
+    @property
+    def undefined_symbols(self):
+        """Symbols this node depends on: none."""
+        return set()
+    @property
+    def args(self):
+        """Child nodes: none."""
+        return []
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+    def __hash__(self):
+        # Defining __eq__ alone sets __hash__ to None and makes instances unhashable. Every fence compares equal
+        # to every other fence, so all of them must share one hash value.
+        return hash(NontemporalFence)
+class CachelineSize(ast.Node):
+    """Child-less AST node that declares the cache-line helper symbols as defined.
+    The node reports ``_clsize``, ``_clsize_mask`` and ``_cl_lastvec`` as defined symbols, reads no symbols and has
+    no arguments. Any two instances compare equal and the hash is derived from ``symbol``.
+    NOTE(review): how the backend computes the values of these symbols is not visible here - confirm.
+    """
+    symbol = sp.Symbol("_clsize")
+    mask_symbol = sp.Symbol("_clsize_mask")
+    last_symbol = sp.Symbol("_cl_lastvec")
+    def __init__(self):
+        super().__init__(parent=None)
+    @property
+    def symbols_defined(self):
+        """The three symbols made available by this node."""
+        defined = set()
+        defined.update((self.symbol, self.mask_symbol, self.last_symbol))
+        return defined
+    @property
+    def undefined_symbols(self):
+        """Symbols this node depends on: none."""
+        return set()
+    @property
+    def args(self):
+        """Child nodes: none."""
+        return []
+    def __eq__(self, other):
+        return isinstance(other, CachelineSize)
+    def __hash__(self):
+        return hash(self.symbol)
+def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
              assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
+    # TODO Vectorization Revamp we first introduce the remainder loop and then check if we can even vectorise.
+    #  Maybe first copy the ast and return the copied version on failure
    """Explicit vectorization using SIMD vectorization via intrinsics.
    Args:
@@ -51,9 +100,14 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                        depending on the access pattern there might be additional padding
                                        required at the end of the array
    """
+    if instruction_set == 'best':
+        if get_supported_instruction_sets():
+            instruction_set = get_supported_instruction_sets()[-1]
+        else:
+            instruction_set = 'avx'
    if instruction_set is None:
        return
    all_fields = kernel_ast.fields_accessed
    if nontemporal is None or nontemporal is False:
        nontemporal = {}
@@ -69,39 +123,53 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
-    vector_is = get_vector_instruction_set('double' if float_size == 8 else 'float',
+    default_float_type = 'float64' if float_size == 8 else 'float32'
-                                           instruction_set=instruction_set)
+    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
-    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+    if nontemporal and 'cachelineZero' in vector_is:
-                                                nontemporal, assume_sufficient_line_padding)
+        kernel_ast.use_all_written_field_sizes = True
-    insert_vector_casts(kernel_ast)
+    strided = 'storeS' in vector_is and 'loadS' in vector_is
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type)
-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
-                                                assume_sufficient_line_padding):
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
-    all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
+    all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
-    inner_loops = [n for n in all_loops if n.is_innermost_loop]
+    inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
-    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}
+    zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']
    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start
        # cut off loop tail, that is not a multiple of four
-        if assume_aligned and assume_sufficient_line_padding:
+        if keep_loop_stop:
+            pass
+        elif assume_aligned and assume_sufficient_line_padding:
            loop_range = loop_node.stop - loop_node.start
            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-            loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)]
+            # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
+            loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
+                          if isinstance(loop, ast.LoopOverCoordinate)]
            assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
            if len(loop_nodes) == 0:
                continue
            loop_node = loop_nodes[0]
+            # loop_node is the vectorized one
        # Find all array accesses (indexed) that depend on the loop counter as offset
        loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
        substitutions = {}
@@ -109,54 +177,184 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
+                    aligned_access = False
+                else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
+                    aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
-                aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
+                stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
-                if not loop_counter_is_offset:
+                if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
                    successful = False
                    break
                typed_symbol = base.label
-                assert type(typed_symbol.dtype) is PointerType, \
+                assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"
-                    "Type of access is {}, {}".format(typed_symbol.dtype, indexed)
                vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
                use_aligned_access = aligned_access and assume_aligned
                nontemporal = False
                if hasattr(indexed, 'field'):
                    nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
-                substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal)
+                substitutions[indexed] = VectorMemoryAccess(indexed, vec_type, use_aligned_access, nontemporal, True,
+                                                            stride if strided else 1)
+                if nontemporal:
+                    # insert NontemporalFence after the outermost loop
+                    parent = loop_node.parent
+                    while type(parent.parent.parent) is not ast.KernelFunction:
+                        parent = parent.parent
+                    parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
+                    # insert CachelineSize at the beginning of the kernel
+                    parent.parent.insert_front(CachelineSize(), if_not_exists=True)
        if not successful:
            warnings.warn("Could not vectorize loop because of non-consecutive memory access")
            continue
        loop_node.step = vector_width
        loop_node.subs(substitutions)
+        arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
+        arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
+                         VectorType(loop_counter_symbol.dtype, vector_int_width))
+        vector_loop_counter = arg_1 + arg_2
+        fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
+                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, VectorMemoryAccess))
+        mask_conditionals(loop_node)
-def insert_vector_casts(ast_node):
+        from pystencils.rng import RNGBase
+        substitutions = {}
+        for rng in loop_node.atoms(RNGBase):
+            new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
+                                  for s in rng.result_symbols]
+            substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
+            rng._symbols_defined = set(new_result_symbols)
+        fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
+        insert_vector_casts(loop_node, vector_is, default_float_type)
+def mask_conditionals(loop_body):
+    """Fold the conditions of ``ast.Conditional`` nodes into the vector memory accesses they guard.
+    The tree below ``loop_body`` is walked and modified in place. While descending, the conditions of the enclosing
+    conditionals are accumulated with ``sp.And``; inside every ``ast.SympyAssignment`` reached with a non-trivial
+    mask, each ``VectorMemoryAccess`` is rebuilt with that mask and-ed onto its fifth argument. Conditions that
+    depend on the loop counter are finally wrapped in ``vec_any``.
+    Args:
+        loop_body: node to process; must provide ``loop_counter_symbol`` and ``args``
+    """
+    def visit_node(node, mask):
+        if isinstance(node, ast.Conditional):
+            cond = node.condition_expr
+            # A condition that does not involve the loop counter, or that is already a vec_all/vec_any reduction,
+            # contributes nothing to the mask of the true branch and is left unwrapped.
+            skip = (loop_body.loop_counter_symbol not in cond.atoms(sp.Symbol)) or cond.func in (vec_all, vec_any)
+            cond = True if skip else cond
+            true_mask = sp.And(cond, mask)
+            visit_node(node.true_block, true_mask)
+            if node.false_block:
+                false_mask = sp.And(sp.Not(node.condition_expr), mask)
+                # Descend into the false block only. Visiting `node` itself here would re-enter this very branch
+                # and recurse without end.
+                visit_node(node.false_block, false_mask)
+            if not skip:
+                node.condition_expr = vec_any(node.condition_expr)
+        elif isinstance(node, ast.SympyAssignment):
+            if mask is not True:
+                # Rebuild every access with the accumulated mask combined into argument index 4 (presumably the
+                # access mask - confirm against VectorMemoryAccess); all other arguments are passed through.
+                s = {ma: VectorMemoryAccess(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
+                     for ma in node.atoms(VectorMemoryAccess)}
+                node.subs(s)
+        else:
+            for arg in node.args:
+                visit_node(arg, mask)
+    visit_node(loop_body, mask=True)
+def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
    """Inserts necessary casts from scalar values to vector values."""
-    handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all)
+    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
-    def visit_expr(expr):
+    def is_scalar(expr) -> bool:
+        if hasattr(expr, "dtype"):
+            if type(expr.dtype) is VectorType:
+                return False
+            # Else branch: If expr is a CastFunc, then whether the expression
+            # is scalar is determined by the argument (remember: vector casts
+            # are not inserted yet). Therefore, we must recurse into the args of
+            # expr below. Otherwise, this expression is atomic and in that case
+            # it is assumed to be scalar below.
-        if isinstance(expr, cast_func) or isinstance(expr, vector_memory_access):
+        if isinstance(expr, ast.ResolvedFieldAccess):
-            return expr
+            # expr.field is not in expr.args
-        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, sp.boolalg.BooleanFunction):
+            return is_scalar(expr.field)
-            new_args = [visit_expr(a) for a in expr.args]
+        elif isinstance(expr, (vec_any, vec_all)):
-            arg_types = [get_type_of_expression(a) for a in new_args]
+            return True
+        if not hasattr(expr, "args"):
+            return True
+        return all(is_scalar(arg) for arg in expr.args)
+    # TODO Vectorization Revamp: get rid of default_type
+    def visit_expr(expr, default_type='double', force_vectorize=False):
+        if isinstance(expr, VectorMemoryAccess):
+            return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
+                                      *expr.args[5:])
+        elif isinstance(expr, CastFunc):
+            cast_type = expr.args[1]
+            arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            assert cast_type in [BasicType('float32'), BasicType('float64')], \
+                f'Vectorization cannot vectorize type {cast_type}'
+            return expr.func(arg, VectorType(cast_type, instruction_set['width']))
+        elif expr.func is sp.Abs and 'abs' not in instruction_set:
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
+                else get_type_of_expression(expr.args[0])
+            pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
+                              (new_arg, True))
+            return visit_expr(pw, default_type, force_vectorize)
+        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
+            if expr.func is sp.Mul and expr.args[0] == -1:
+                # special treatment for the unary minus: make sure that the -1 has the same type as the argument
+                dtype = int
+                for arg in expr.atoms(VectorMemoryAccess):
+                    if arg.dtype.base_type.is_float():
+                        dtype = arg.dtype.base_type.numpy_dtype.type
+                for arg in expr.atoms(TypedSymbol):
+                    if type(arg.dtype) is VectorType and arg.dtype.base_type.is_float():
+                        dtype = arg.dtype.base_type.numpy_dtype.type
+                if dtype is not int:
+                    if dtype is np.float32:
+                        default_type = 'float'
+                    expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
            if not any(type(t) is VectorType for t in arg_types):
                return expr
            else:
                target_type = collate_types(arg_types)
-                casted_args = [cast_func(a, target_type) if t != target_type else a
+                casted_args = [
-                               for a, t in zip(new_args, arg_types)]
+                    CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                    for a, t in zip(new_args, arg_types)]
                return expr.func(*casted_args)
+        elif expr.func is sp.UnevaluatedExpr:
+            assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
+            # TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
+            # TODO be fixed for real at some point.
+            if expr.args[0].is_Pow:
+                base = expr.args[0].base
+                exp = expr.args[0].exp
+                expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
+            target_type = collate_types(arg_types)
+            if not any(type(t) is VectorType for t in arg_types):
+                target_type = VectorType(target_type, instruction_set['width'])
+            casted_args = [
+                CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                for a, t in zip(new_args, arg_types)]
+            return expr.func(expr.args[0].func(*casted_args, evaluate=False))
        elif expr.func is sp.Pow:
-            new_arg = visit_expr(expr.args[0])
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
            return expr.func(new_arg, expr.args[1])
        elif expr.func == sp.Piecewise:
-            new_results = [visit_expr(a[0]) for a in expr.args]
+            new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
-            new_conditions = [visit_expr(a[1]) for a in expr.args]
+            new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
            types_of_results = [get_type_of_expression(a) for a in new_results]
            types_of_conditions = [get_type_of_expression(a) for a in new_conditions]
@@ -167,43 +365,61 @@ def insert_vector_casts(ast_node):
            if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
                condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
-            casted_results = [cast_func(a, result_target_type) if t != result_target_type else a
+            casted_results = [CastFunc(a, result_target_type) if t != result_target_type else a
                              for a, t in zip(new_results, types_of_results)]
-            casted_conditions = [cast_func(a, condition_target_type)
+            casted_conditions = [CastFunc(a, condition_target_type)
                                 if t != condition_target_type and a is not True else a
                                 for a, t in zip(new_conditions, types_of_conditions)]
            return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
-        else:
+        elif isinstance(expr, TypedSymbol):
+            if force_vectorize:
+                expr_type = get_type_of_expression(expr)
+                if type(expr_type) is not VectorType:
+                    vector_type = VectorType(expr_type, instruction_set['width'])
+                    return CastFunc(expr, vector_type)
            return expr
+        elif isinstance(expr, (sp.Number, BooleanAtom)):
+            return expr
+        else:
+            raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
+                                      f'The expression {expr} of type {type(expr)} is not known yet.')
-    def visit_node(node, substitution_dict):
+    def visit_node(node, substitution_dict, default_type='double'):
        substitution_dict = substitution_dict.copy()
        for arg in node.args:
            if isinstance(arg, ast.SympyAssignment):
                assignment = arg
+                # If there is a remainder loop we do not vectorise it, thus lhs will indicate this
+                # if isinstance(assignment.lhs, ast.ResolvedFieldAccess):
+                # continue
                subs_expr = fast_subs(assignment.rhs, substitution_dict,
                                      skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
-                assignment.rhs = visit_expr(subs_expr)
-                rhs_type = get_type_of_expression(assignment.rhs)
+                # If either side contains a vectorized subexpression, both sides
+                # must be fully vectorized.
+                lhs_scalar = is_scalar(assignment.lhs)
+                rhs_scalar = is_scalar(subs_expr)
+                assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
                if isinstance(assignment.lhs, TypedSymbol):
-                    lhs_type = assignment.lhs.dtype
+                    if lhs_scalar and not rhs_scalar:
-                    if type(rhs_type) is VectorType and type(lhs_type) is not VectorType:
+                        lhs_type = get_type_of_expression(assignment.lhs)
+                        rhs_type = get_type_of_expression(assignment.rhs)
                        new_lhs_type = VectorType(lhs_type, rhs_type.width)
                        new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
                        substitution_dict[assignment.lhs] = new_lhs
                        assignment.lhs = new_lhs
-                elif isinstance(assignment.lhs.func, cast_func):
+                elif isinstance(assignment.lhs, VectorMemoryAccess):
-                    lhs_type = assignment.lhs.args[1]
+                    assignment.lhs = visit_expr(assignment.lhs, default_type)
-                    if type(lhs_type) is VectorType and type(rhs_type) is not VectorType:
-                        assignment.rhs = cast_func(assignment.rhs, lhs_type)
            elif isinstance(arg, ast.Conditional):
                arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict,
                                               skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
-                arg.condition_expr = visit_expr(arg.condition_expr)
+                arg.condition_expr = visit_expr(arg.condition_expr, default_type)
-                visit_node(arg, substitution_dict)
+                visit_node(arg, substitution_dict, default_type)
            else:
-                visit_node(arg, substitution_dict)
+                visit_node(arg, substitution_dict, default_type)
-    visit_node(ast_node, {})
+    visit_node(ast_node, {}, default_float_type)
--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
+import warnings
 from typing import Tuple, Union
 from .datahandling_interface import DataHandling
+from ..enums import Target
 from .serial_datahandling import SerialDataHandling
 try:
@@ -18,9 +21,10 @@ except ImportError:
 def create_data_handling(domain_size: Tuple[int, ...],
                         periodicity: Union[bool, Tuple[bool, ...]] = False,
                         default_layout: str = 'SoA',
-                         default_target: str = 'cpu',
+                         default_target: Target = Target.CPU,
                         parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
    """Creates a data handling instance.
    Args:
@@ -28,10 +32,19 @@ def create_data_handling(domain_size: Tuple[int, ...],
        periodicity: either True, False for full or no periodicity or a tuple of booleans indicating periodicity
                     for each coordinate
        default_layout: default array layout, that is used if not explicitly specified in 'add_array'
-        default_target: either 'cpu' or 'gpu'
+        default_target: `Target`
        parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
        default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
    """
+    if isinstance(default_target, str):
+        new_target = Target[default_target.upper()]
+        warnings.warn(f'Target "{default_target}" as str is deprecated. Use {new_target} instead',
+                      category=DeprecationWarning)
+        default_target = new_target
    if parallel:
        if wlb is None:
            raise ValueError("Cannot create parallel data handling because walberla module is not available")
@@ -56,8 +69,12 @@ def create_data_handling(domain_size: Tuple[int, ...],
        return ParallelDataHandling(blocks=block_storage, dim=dim, default_target=default_target,
                                    default_layout=default_layout, default_ghost_layers=default_ghost_layers)
    else:
-        return SerialDataHandling(domain_size, periodicity=periodicity, default_target=default_target,
+        return SerialDataHandling(domain_size,
-                                  default_layout=default_layout, default_ghost_layers=default_ghost_layers)
+                                  periodicity=periodicity,
+                                  default_target=default_target,
+                                  default_layout=default_layout,
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)
 __all__ = ['create_data_handling']
--- a/pystencils/datahandling/blockiteration.py
+++ b/pystencils/datahandling/blockiteration.py
@@ -111,15 +111,15 @@ class ParallelBlock(Block):
    def __getitem__(self, data_name):
        result = self._block[self._name_prefix + data_name]
        type_name = type(result).__name__
-        if type_name == 'GhostLayerField':
+        if 'GhostLayerField' in type_name:
-            result = wlb.field.toArray(result, withGhostLayers=self._gls)
+            result = wlb.field.toArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
-        elif type_name == 'GpuField':
+        elif 'GpuField' in type_name:
-            result = wlb.cuda.toGpuArray(result, withGhostLayers=self._gls)
+            result = wlb.gpu.toGpuArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
        return result
    def _normalize_array_shape(self, arr):
-        if arr.shape[-1] == 1:
+        if arr.shape[-1] == 1 and len(arr.shape) == 4:
            arr = arr[..., 0]
        return arr[self._localSlice]
--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -3,7 +3,8 @@ from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple, Union
 import numpy as np
-from pystencils.field import Field
+from pystencils.enums import Target, Backend
+from pystencils.field import Field, FieldType
 class DataHandling(ABC):
@@ -16,7 +17,14 @@ class DataHandling(ABC):
    'gather' function that has collects (parts of the) distributed data on a single process.
    """
+    _GPU_LIKE_TARGETS = [Target.GPU]
+    _GPU_LIKE_BACKENDS = [Backend.CUDA]
    # ---------------------------- Adding and accessing data -----------------------------------------------------------
+    @property
+    @abstractmethod
+    def default_target(self) -> Target:
+        """Target Enum indicating the target of the computation"""
    @property
    @abstractmethod
@@ -36,7 +44,7 @@ class DataHandling(ABC):
    @abstractmethod
    def add_array(self, name: str, values_per_cell, dtype=np.float64,
                  latex_name: Optional[str] = None, ghost_layers: Optional[int] = None, layout: Optional[str] = None,
-                  cpu: bool = True, gpu: Optional[bool] = None, alignment=False) -> Field:
+                  cpu: bool = True, gpu: Optional[bool] = None, alignment=False, field_type=FieldType.GENERIC) -> Field:
        """Adds a (possibly distributed) array to the handling that can be accessed using the given name.
        For each array a symbolic field is available via the 'fields' dictionary
@@ -53,12 +61,63 @@ class DataHandling(ABC):
            layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'.
                    this is only important if values_per_cell > 1
            cpu: allocate field on the CPU
-            gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'gpu'
+            gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'GPU'
            alignment: either False for no alignment, or the number of bytes to align to
        Returns:
            pystencils field, that can be used to formulate symbolic kernels
        """
+    def add_arrays(self,
+                   description: str,
+                   dtype=np.float64,
+                   ghost_layers: Optional[int] = None,
+                   layout: Optional[str] = None,
+                   cpu: bool = True,
+                   gpu: Optional[bool] = None,
+                   alignment=False,
+                   field_type=FieldType.GENERIC) -> Tuple[Field, ...]:
+        """Adds multiple arrays using a string description similar to :func:`pystencils.fields`
+        >>> from pystencils.datahandling import create_data_handling
+        >>> dh = create_data_handling((20, 30))
+        >>> x, y = dh.add_arrays('x, y(9)')
+        >>> print(dh.fields)
+        {'x': x: double[22,32], 'y': y(9): double[22,32]}
+        >>> assert x == dh.fields['x']
+        >>> assert dh.fields['x'].shape == (22, 32)
+        >>> assert dh.fields['y'].index_shape == (9,)
+        Args:
+            description (str): String description of the fields to add
+            dtype: data type of the array as numpy data type
+            ghost_layers: number of ghost layers - if not specified a default value specified in the constructor
+                         is used
+            layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'.
+                    this is only important if values_per_cell > 1
+            cpu: allocate field on the CPU
+            gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'GPU'
+            alignment: either False for no alignment, or the number of bytes to align to
+            field_type: passed on unchanged to :meth:`add_array` for every array that is created
+        Returns:
+            Tuple with one field per created array, in the order the names were parsed from the description
+        """
+        from pystencils.field import _parse_part1
+        names = []
+        for name, indices in _parse_part1(description):
+            names.append(name)
+            self.add_array(name,
+                           values_per_cell=indices,
+                           dtype=dtype,
+                           ghost_layers=ghost_layers,
+                           layout=layout,
+                           cpu=cpu,
+                           gpu=gpu,
+                           alignment=alignment,
+                           field_type=field_type)
+        # Build a real tuple (as the return annotation promises) instead of a one-shot generator,
+        # so the result supports len(), indexing and repeated iteration; unpacking keeps working.
+        return tuple(self.fields[n] for n in names)
    @abstractmethod
    def has_data(self, name):
        """Returns true if a field or custom data element with this name was added."""
@@ -153,6 +212,10 @@ class DataHandling(ABC):
        directly passed to the kernel function and override possible parameters from the DataHandling
        """
+    @abstractmethod
+    def get_kernel_kwargs(self, kernel_function, **kwargs):
+        """Returns the input arguments of a kernel"""
    @abstractmethod
    def swap(self, name1, name2, gpu=False):
        """Swaps data of two arrays"""
@@ -222,7 +285,7 @@ class DataHandling(ABC):
            names: what data to synchronize: name of array or sequence of names
            stencil: stencil as string defining which neighbors are synchronized e.g. 'D2Q9', 'D3Q19'
                     if None, a full synchronization (i.e. D2Q9 or D3Q27) is done
-            target: either 'cpu' or 'gpu
+            target: `Target` either 'CPU' or 'GPU'
            kwargs: implementation specific, optional optimization parameters for communication
        Returns:
@@ -268,6 +331,7 @@ class DataHandling(ABC):
                b[array_name][(Ellipsis, *value_idx)].fill(val)
            else:
                b[array_name].fill(val)
+            self.to_gpu(array_name)
    def min(self, array_name, slice_obj=None, ghost_layers=False, inner_ghost_layers=False, reduce=True):
        """Returns the minimum value inside the domain or slice of the domain.

--- a/pystencils/datahandling/parallel_datahandling.py
+++ b/pystencils/datahandling/parallel_datahandling.py
@@ -7,16 +7,18 @@ import waLBerla as wlb
 from pystencils.datahandling.blockiteration import block_iteration, sliced_block_iteration
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.field import Field
+from pystencils.enums import Backend
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.field import Field, FieldType
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 from pystencils.utils import DotDict
+from pystencils import Target
 class ParallelDataHandling(DataHandling):
    GPU_DATA_PREFIX = "gpu_"
    VTK_COUNTER = 0
-    def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, default_target='cpu'):
+    def __init__(self, blocks, default_ghost_layers=1, default_layout='SoA', dim=3, default_target=Target.CPU):
        """
        Creates data handling based on walberla block storage
@@ -27,18 +29,19 @@ class ParallelDataHandling(DataHandling):
            dim: dimension of scenario,
                 walberla always uses three dimensions, so if dim=2 the extend of the
                 z coordinate of blocks has to be 1
-            default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated
+            default_target: `Target`, either 'CPU' or 'GPU' . If set to 'GPU' for each array also a GPU version is
-                           if not overwritten in add_array, and synchronization functions are for the GPU by default
+                            allocated if not overwritten in add_array, and synchronization functions are for the GPU by
+                            default
        """
        super(ParallelDataHandling, self).__init__()
        assert dim in (2, 3)
-        self.blocks = blocks
+        self._blocks = blocks
-        self.default_ghost_layers = default_ghost_layers
+        self._default_ghost_layers = default_ghost_layers
-        self.default_layout = default_layout
+        self._default_layout = default_layout
        self._fields = DotDict()  # maps name to symbolic pystencils field
        self._field_name_to_cpu_data_name = {}
        self._field_name_to_gpu_data_name = {}
-        self.data_names = set()
+        self._data_names = set()
        self._dim = dim
        self._fieldInformation = {}
        self._cpu_gpu_pairs = []
@@ -52,7 +55,11 @@ class ParallelDataHandling(DataHandling):
        if self._dim == 2:
            assert self.blocks.getDomainCellBB().size[2] == 1
-        self.default_target = default_target
+        self._default_target = default_target
+    @property
+    def default_target(self):
+        return self._default_target
    @property
    def dim(self):
@@ -70,6 +77,22 @@ class ParallelDataHandling(DataHandling):
    def fields(self):
        return self._fields
+    @property
+    def blocks(self):
+        return self._blocks
+    @property
+    def default_ghost_layers(self):
+        return self._default_ghost_layers
+    @property
+    def default_layout(self):
+        return self._default_layout
+    @property
+    def data_names(self):
+        """Returns the set stored in ``self._data_names`` by the constructor."""
+        # Read the private backing attribute: ``self.data_names`` would re-enter this very
+        # property and recurse until a RecursionError is raised.
+        return self._data_names
    def ghost_layers_of_field(self, name):
        return self._fieldInformation[name]['ghost_layers']
@@ -90,18 +113,18 @@ class ParallelDataHandling(DataHandling):
        self._custom_data_names.append(name)
    def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None,
-                  layout=None, cpu=True, gpu=None, alignment=False):
+                  layout=None, cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC):
        if ghost_layers is None:
            ghost_layers = self.default_ghost_layers
        if gpu is None:
-            gpu = self.default_target == 'gpu'
+            gpu = self.default_target == Target.GPU
        if layout is None:
            layout = self.default_layout
        if len(self.blocks) == 0:
            raise ValueError("Data handling expects that each process has at least one block")
        if hasattr(dtype, 'type'):
            dtype = dtype.type
-        if name in self.blocks[0] or self.GPU_DATA_PREFIX + name in self.blocks[0]:
+        if name in self.blocks[0].fieldNames or self.GPU_DATA_PREFIX + name in self.blocks[0].fieldNames:
            raise ValueError("Data with this name has already been added")
        if alignment is False or alignment is None:
@@ -109,11 +132,14 @@ class ParallelDataHandling(DataHandling):
        if hasattr(values_per_cell, '__len__'):
            raise NotImplementedError("Parallel data handling does not support multiple index dimensions")
-        self._fieldInformation[name] = {'ghost_layers': ghost_layers,
+        self._fieldInformation[name] = {
-                                        'values_per_cell': values_per_cell,
+            'ghost_layers': ghost_layers,
-                                        'layout': layout,
+            'values_per_cell': values_per_cell,
-                                        'dtype': dtype,
+            'layout': layout,
-                                        'alignment': alignment}
+            'dtype': dtype,
+            'alignment': alignment,
+            'field_type': field_type,
+        }
        layout_map = {'fzyx': wlb.field.Layout.fzyx, 'zyxf': wlb.field.Layout.zyxf,
                      'f': wlb.field.Layout.fzyx,
@@ -125,8 +151,8 @@ class ParallelDataHandling(DataHandling):
        if gpu:
            if alignment != 0:
                raise ValueError("Alignment for walberla GPU fields not yet supported")
-            wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
+            wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
-                                          usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
+                                         usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
        if cpu and gpu:
            self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))
@@ -140,7 +166,8 @@ class ParallelDataHandling(DataHandling):
        assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists"
        self.fields[name] = Field.create_generic(name, self.dim, dtype, index_dimensions, layout,
-                                                 index_shape=(values_per_cell,) if index_dimensions > 0 else None)
+                                                 index_shape=(values_per_cell,) if index_dimensions > 0 else None,
+                                                 field_type=field_type)
        self.fields[name].latex_name = latex_name
        self._field_name_to_cpu_data_name[name] = name
        if gpu:
@@ -211,15 +238,13 @@ class ParallelDataHandling(DataHandling):
            array = array[:, :, 0]
        if last_element and self.fields[name].index_dimensions > 0:
            array = array[..., last_element[0]]
-        if self.fields[name].index_dimensions == 0:
-            array = array[..., 0]
        return array
    def _normalize_arr_shape(self, arr, index_dimensions):
-        if index_dimensions == 0:
+        if index_dimensions == 0 and len(arr.shape) > 3:
            arr = arr[..., 0]
-        if self.dim == 2:
+        if self.dim == 2 and len(arr.shape) > 2:
            arr = arr[:, :, 0]
        return arr
@@ -228,9 +253,9 @@ class ParallelDataHandling(DataHandling):
            kernel_function(**arg_dict)
    def get_kernel_kwargs(self, kernel_function, **kwargs):
-        if kernel_function.ast.backend == 'gpucuda':
+        if kernel_function.ast.backend == Backend.CUDA:
            name_map = self._field_name_to_gpu_data_name
-            to_array = wlb.cuda.toGpuArray
+            to_array = wlb.gpu.toGpuArray
        else:
            name_map = self._field_name_to_cpu_data_name
            to_array = wlb.field.toArray
@@ -242,7 +267,7 @@ class ParallelDataHandling(DataHandling):
        for block in self.blocks:
            field_args = {}
            for data_name, f in data_used_in_kernel:
-                arr = to_array(block[data_name], withGhostLayers=[True, True, self.dim == 3])
+                arr = to_array(block[data_name], with_ghost_layers=[True, True, self.dim == 3])
                arr = self._normalize_arr_shape(arr, f.index_dimensions)
                field_args[f.name] = arr
            field_args.update(kwargs)
@@ -255,7 +280,8 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
@@ -263,28 +289,29 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
    def is_on_gpu(self, name):
        return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs
    def all_to_cpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_cpu(name)
    def all_to_gpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_gpu(name)
    def synchronization_function_cpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
-        return self.synchronization_function(names, stencil, 'cpu', buffered, stencil_restricted)
+        return self.synchronization_function(names, stencil, Target.CPU, buffered, stencil_restricted)
    def synchronization_function_gpu(self, names, stencil=None, buffered=True, stencil_restricted=False, **_):
-        return self.synchronization_function(names, stencil, 'gpu', buffered, stencil_restricted)
+        return self.synchronization_function(names, stencil, Target.GPU, buffered, stencil_restricted)
    def synchronization_function(self, names, stencil=None, target=None, buffered=True, stencil_restricted=False):
        if target is None:
@@ -297,13 +324,13 @@ class ParallelDataHandling(DataHandling):
            names = [names]
        create_scheme = wlb.createUniformBufferedScheme if buffered else wlb.createUniformDirectScheme
-        if target == 'cpu':
+        if target == Target.CPU:
            create_packing = wlb.field.createPackInfo if buffered else wlb.field.createMPIDatatypeInfo
-            if not buffered and stencil_restricted:
+            if buffered and stencil_restricted:
                create_packing = wlb.field.createStencilRestrictedPackInfo
        else:
-            assert target == 'gpu'
+            assert target == Target.GPU
-            create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
+            create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
            names = [self.GPU_DATA_PREFIX + name for name in names]
        sync_function = create_scheme(self.blocks, stencil)
@@ -379,7 +406,7 @@ class ParallelDataHandling(DataHandling):
        if not os.path.exists(directory):
            os.mkdir(directory)
        if os.path.isfile(directory):
-            raise RuntimeError("Trying to save to {}, but file exists already".format(directory))
+            raise RuntimeError(f"Trying to save to {directory}, but file exists already")
        for field_name, data_name in self._field_name_to_cpu_data_name.items():
            self.blocks.writeBlockData(data_name, os.path.join(directory, field_name + ".dat"))

--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -6,22 +6,24 @@ import numpy as np
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.field import (
+from pystencils.enums import Target
-    Field, create_numpy_array_with_layout, layout_string_to_tuple, spatial_layout_string_to_tuple)
+from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
+                              layout_string_to_tuple, spatial_layout_string_to_tuple)
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict
-try:
-    import pycuda.gpuarray as gpuarray
-    import pycuda.autoinit  # NOQA
-except ImportError:
-    gpuarray = None
 class SerialDataHandling(DataHandling):
-    def __init__(self, domain_size: Sequence[int], default_ghost_layers: int = 1, default_layout: str = 'SoA',
+    def __init__(self,
-                 periodicity: Union[bool, Sequence[bool]] = False, default_target: str = 'cpu') -> None:
+                 domain_size: Sequence[int],
+                 default_ghost_layers: int = 1,
+                 default_layout: str = 'SoA',
+                 periodicity: Union[bool, Sequence[bool]] = False,
+                 default_target: Target = Target.CPU,
+                 array_handler=None,
+                 device_number=None) -> None:
        """
        Creates a data handling for single node simulations.
@@ -29,8 +31,17 @@ class SerialDataHandling(DataHandling):
            domain_size: size of the spatial domain as tuple
            default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
            default_layout: default layout used, if  not overridden in add_array() method
-            default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
-                            if not overwritten in add_array, and synchronization functions are for the GPU by default
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
+            default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
+                            allocated if not overwritten in add_array, and synchronization functions are for the GPU by
+                            default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
        """
        super(SerialDataHandling, self).__init__()
        self._domainSize = tuple(domain_size)
@@ -43,6 +54,19 @@ class SerialDataHandling(DataHandling):
        self.custom_data_gpu = DotDict()
        self._custom_data_transfer_functions = {}
+        if not array_handler:
+            try:
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
+                self.array_handler = GPUNotAvailableHandler()
+        else:
+            self.array_handler = array_handler
        if periodicity is None or periodicity is False:
            periodicity = [False] * self.dim
        if periodicity is True:
@@ -50,9 +74,13 @@ class SerialDataHandling(DataHandling):
        self._periodicity = periodicity
        self._field_information = {}
-        self.default_target = default_target
+        self._default_target = default_target
        self._start_time = time.perf_counter()
+    @property
+    def default_target(self):
+        return self._default_target
    @property
    def dim(self):
        return len(self._domainSize)
@@ -76,13 +104,13 @@ class SerialDataHandling(DataHandling):
        return self._field_information[name]['values_per_cell']
    def add_array(self, name, values_per_cell=1, dtype=np.float64, latex_name=None, ghost_layers=None, layout=None,
-                  cpu=True, gpu=None, alignment=False):
+                  cpu=True, gpu=None, alignment=False, field_type=FieldType.GENERIC):
        if ghost_layers is None:
            ghost_layers = self.default_ghost_layers
        if layout is None:
            layout = self.default_layout
        if gpu is None:
-            gpu = self.default_target == 'gpu'
+            gpu = self.default_target in self._GPU_LIKE_TARGETS
        kwargs = {
            'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
@@ -90,7 +118,7 @@ class SerialDataHandling(DataHandling):
        }
        if not hasattr(values_per_cell, '__len__'):
-            values_per_cell = (values_per_cell, )
+            values_per_cell = (values_per_cell,)
        if len(values_per_cell) == 1 and values_per_cell[0] == 1:
            values_per_cell = ()
@@ -100,6 +128,7 @@ class SerialDataHandling(DataHandling):
            'layout': layout,
            'dtype': dtype,
            'alignment': alignment,
+            'field_type': field_type,
        }
        index_dimensions = len(values_per_cell)
@@ -110,10 +139,14 @@ class SerialDataHandling(DataHandling):
        else:
            layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)
-        # cpu_arr is always created - since there is no create_pycuda_array_with_layout()
+        # cpu_arr is always created - since there is no create_gpu_array_with_layout()
        byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)
        if alignment and gpu:
            raise NotImplementedError("Alignment for GPU fields not supported")
@@ -125,10 +158,11 @@ class SerialDataHandling(DataHandling):
        if gpu:
            if name in self.gpu_arrays:
                raise ValueError("GPU Field with this name already exists")
-            self.gpu_arrays[name] = gpuarray.to_gpu(cpu_arr)
+            self.gpu_arrays[name] = self.array_handler.to_gpu(cpu_arr)
        assert all(f.name != name for f in self.fields.values()), "Symbolic field with this name already exists"
-        self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions)
+        self.fields[name] = Field.create_from_numpy_array(name, cpu_arr, index_dimensions=index_dimensions,
+                                                          field_type=field_type)
        self.fields[name].latex_name = latex_name
        return self.fields[name]
@@ -207,7 +241,7 @@ class SerialDataHandling(DataHandling):
    def swap(self, name1, name2, gpu=None):
        if gpu is None:
-            gpu = self.default_target == "gpu"
+            gpu = self.default_target in self._GPU_LIKE_TARGETS
        arr = self.gpu_arrays if gpu else self.cpu_arrays
        arr[name1], arr[name2] = arr[name2], arr[name1]
@@ -220,12 +254,12 @@ class SerialDataHandling(DataHandling):
            self.to_gpu(name)
    def run_kernel(self, kernel_function, **kwargs):
-        arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
+        arrays = self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays
-        kernel_function(**arrays, **kwargs)
+        kernel_function(**{**arrays, **kwargs})
    def get_kernel_kwargs(self, kernel_function, **kwargs):
        result = {}
-        result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
+        result.update(self.gpu_arrays if kernel_function.ast.backend in self._GPU_LIKE_BACKENDS else self.cpu_arrays)
        result.update(kwargs)
        return [result]
@@ -234,28 +268,30 @@ class SerialDataHandling(DataHandling):
            transfer_func = self._custom_data_transfer_functions[name][1]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.gpu_arrays[name].get(self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
            transfer_func = self._custom_data_transfer_functions[name][0]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.gpu_arrays[name].set(self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
    def is_on_gpu(self, name):
        return name in self.gpu_arrays
    def synchronization_function_cpu(self, names, stencil_name=None, **_):
-        return self.synchronization_function(names, stencil_name, 'cpu')
+        return self.synchronization_function(names, stencil_name, target=Target.CPU)
    def synchronization_function_gpu(self, names, stencil_name=None, **_):
-        return self.synchronization_function(names, stencil_name, 'gpu')
+        return self.synchronization_function(names, stencil_name, target=Target.GPU)
-    def synchronization_function(self, names, stencil=None, target=None, **_):
+    def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
        if target is None:
            target = self.default_target
-        assert target in ('cpu', 'gpu')
+        assert target in (Target.CPU, Target.GPU)
        if not hasattr(names, '__len__') or type(names) is str:
            names = [names]
@@ -284,25 +320,28 @@ class SerialDataHandling(DataHandling):
            gls = self._field_information[name]['ghost_layers']
            values_per_cell = self._field_information[name]['values_per_cell']
            if values_per_cell == ():
-                values_per_cell = (1, )
+                values_per_cell = (1,)
            if len(values_per_cell) == 1:
                values_per_cell = values_per_cell[0]
-            else:
-                raise NotImplementedError("Synchronization of this field is not supported: " + name)
            if len(filtered_stencil) > 0:
-                if target == 'cpu':
+                if target == Target.CPU:
-                    from pystencils.slicing import get_periodic_boundary_functor
+                    if functor is None:
-                    result.append(get_periodic_boundary_functor(filtered_stencil, ghost_layers=gls))
+                        from pystencils.slicing import get_periodic_boundary_functor
+                        functor = get_periodic_boundary_functor
+                    result.append(functor(filtered_stencil, ghost_layers=gls))
                else:
-                    from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as boundary_func
+                    if functor is None:
-                    result.append(boundary_func(filtered_stencil, self._domainSize,
+                        from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
-                                                index_dimensions=self.fields[name].index_dimensions,
+                        target = Target.GPU
-                                                index_dim_shape=values_per_cell,
+                    result.append(functor(filtered_stencil, self._domainSize,
-                                                dtype=self.fields[name].dtype.numpy_dtype,
+                                          index_dimensions=self.fields[name].index_dimensions,
-                                                ghost_layers=gls))
+                                          index_dim_shape=values_per_cell,
+                                          dtype=self.fields[name].dtype.numpy_dtype,
-        if target == 'cpu':
+                                          ghost_layers=gls,
+                                          target=target))
+        if target == Target.CPU:
            def result_functor():
                for arr_name, func in zip(names, result):
                    func(pdfs=self.cpu_arrays[arr_name])
@@ -353,6 +392,7 @@ class SerialDataHandling(DataHandling):
                    raise NotImplementedError("VTK export for fields with more than one index "
                                              "coordinate not implemented")
            image_to_vtk(full_file_name, cell_data=cell_data)
        return writer
    def create_vtk_writer_for_flag_array(self, file_name, data_name, masks_to_name, ghost_layers=False):
@@ -384,7 +424,7 @@ class SerialDataHandling(DataHandling):
        time_running = time.perf_counter() - self._start_time
        spacing = 7 - len(str(int(time_running)))
-        message = "[{: <8}]{}({:.3f} sec) {} ".format(level, spacing * '-', time_running, message)
+        message = f"[{level: <8}]{spacing * '-'}({time_running:.3f} sec) {message} "
        print(message, flush=True)
    def log_on_root(self, *args, level='INFO'):
@@ -398,18 +438,28 @@ class SerialDataHandling(DataHandling):
    def world_rank(self):
        return 0
-    def save_all(self, file):
+    def save_all(self, filename, compressed=True, synchronise_data=True):
-        np.savez_compressed(file, **self.cpu_arrays)
+        """Write all CPU arrays into one ``.npz`` archive.
+        Args:
+            filename: target handed on unchanged to ``np.savez`` / ``np.savez_compressed``
+            compressed: if True write with ``np.savez_compressed``, otherwise with ``np.savez``
+            synchronise_data: if True, ``to_cpu`` is called for every array name that exists in both
+                              ``cpu_arrays`` and ``gpu_arrays`` before the archive is written
+        """
+        if synchronise_data:
+            shared_names = self.cpu_arrays.keys() & self.gpu_arrays.keys()
+            for shared_name in shared_names:
+                self.to_cpu(shared_name)
+        write_archive = np.savez_compressed if compressed else np.savez
+        write_archive(filename, **self.cpu_arrays)
-    def load_all(self, file):
+    def load_all(self, filename, synchronise_data=True):
-        file_contents = np.load(file)
+        """Copy arrays from an ``.npz`` archive into the existing CPU arrays.
+        Arrays that are missing from the archive or whose shape differs from the existing array are
+        skipped with a printed message.
+        Args:
+            filename: archive to read; a string that does not end in ``.npz`` gets that suffix appended,
+                      any other object is handed to ``np.load`` unchanged
+            synchronise_data: if True, ``to_gpu`` is called for every loaded array that also has a GPU array
+        """
+        # np.savez appends '.npz' only when the name does not already END with it, so the same rule is
+        # applied here; a substring test would miss names such as 'state.npz.bak'. Non-string inputs
+        # (e.g. file objects) do not support the suffix logic and are passed through untouched.
+        if isinstance(filename, str) and not filename.endswith('.npz'):
+            filename += '.npz'
+        file_contents = np.load(filename)
         for arr_name, arr_contents in self.cpu_arrays.items():
             if arr_name not in file_contents:
-                print("Skipping read data {} because there is no data with this name in data handling".format(arr_name))
+                print(f"Skipping read data {arr_name} because there is no data with this name in data handling")
                 continue
             if file_contents[arr_name].shape != arr_contents.shape:
-                print("Skipping read data {} because shapes don't match. "
+                print(f"Skipping read data {arr_name} because shapes don't match. "
-                      "Read array shape {}, existing array shape {}".format(arr_name, file_contents[arr_name].shape,
+                      f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
-                                                                            arr_contents.shape))
                 continue
             np.copyto(arr_contents, file_contents[arr_name])
+            if synchronise_data and arr_name in self.gpu_arrays:
+                self.to_gpu(arr_name)
--- a/pystencils/datahandling/vtk.py
+++ b/pystencils/datahandling/vtk.py
--- a/pystencils/display_utils.py
+++ b/pystencils/display_utils.py
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 import sympy as sp
 from pystencils.astnodes import KernelFunction
+from pystencils.enums import Backend
+from pystencils.kernel_wrapper import KernelWrapper
 def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True):
    """Show a sympy or pystencils AST as dot graph"""
    from pystencils.astnodes import Node
-    import graphviz
+    try:
+        import graphviz
+    except ImportError:
+        print("graphviz is not installed. Visualizing the AST is not available")
+        return
    graph_style = {} if graph_style is None else graph_style
    if isinstance(expr, Node):
@@ -29,18 +36,24 @@ def highlight_cpp(code: str):
    from pygments.lexers import CppLexer
    css = HtmlFormatter().get_style_defs('.highlight')
-    css_tag = "<style>{css}</style>".format(css=css)
+    css_tag = f"<style>{css}</style>"
    display(HTML(css_tag))
    return HTML(highlight(code, CppLexer(), HtmlFormatter()))
-def show_code(ast: KernelFunction, custom_backend=None):
+def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None):
    """Returns an object to display generated code (C/C++ or CUDA)
-    Can either  be displayed as HTML in Jupyter notebooks or printed as normal string.
+    Can either be displayed as HTML in Jupyter notebooks or printed as normal string.
    """
    from pystencils.backends.cbackend import generate_c
-    dialect = 'cuda' if ast.backend == 'gpucuda' else 'c'
+    if isinstance(ast, KernelWrapper):
+        ast = ast.ast
+    if ast.backend not in {Backend.C, Backend.CUDA}:
+        raise NotImplementedError(f'get_code_obj is not implemented for backend {ast.backend}')
+    dialect = ast.backend
    class CodeDisplay:
        def __init__(self, ast_input):
@@ -55,3 +68,37 @@ def show_code(ast: KernelFunction, custom_backend=None):
        def __repr__(self):
            return generate_c(self.ast, dialect=dialect, custom_backend=custom_backend)
    return CodeDisplay(ast)
+def get_code_str(ast, custom_backend=None):
+    """Return the generated code of ``ast`` as a plain string.
+    The display object is built by `get_code_obj` and then converted with ``str``.
+    """
+    code_obj = get_code_obj(ast, custom_backend)
+    return str(code_obj)
+def _isnotebook():
+    """Return True only when running under an IPython shell of class ``ZMQInteractiveShell``.
+    Any other shell class, or the absence of ``get_ipython`` altogether, yields False.
+    """
+    try:
+        shell_name = get_ipython().__class__.__name__
+    except NameError:
+        # get_ipython is only defined inside IPython sessions
+        return False
+    # 'ZMQInteractiveShell' is the Jupyter notebook / qtconsole shell; a terminal IPython
+    # ('TerminalInteractiveShell') and every other shell type count as "not a notebook"
+    return shell_name == 'ZMQInteractiveShell'
+def show_code(ast: Union[KernelFunction, KernelWrapper], custom_backend=None):
+    """Display the generated code of ``ast``.
+    Inside a notebook (as reported by `_isnotebook`) the code object is handed to IPython's ``display``.
+    Otherwise the code is printed through ``rich`` with syntax highlighting, falling back to a plain
+    ``print`` when ``rich`` cannot be imported.
+    """
+    code = get_code_obj(ast, custom_backend)
+    if _isnotebook():
+        from IPython.display import display
+        display(code)
+        return
+    try:
+        import rich.syntax
+        import rich.console
+        highlighted = rich.syntax.Syntax(str(code), "c++", theme="monokai", line_numbers=True)
+        rich.console.Console().print(highlighted)
+    except ImportError:
+        print(code)
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
+from enum import Enum, auto
+class Target(Enum):
+    """
+    The Target enumeration represents all possible targets that can be used for the code generation.
+    Member values are assigned automatically by `auto()`.
+    """
+    CPU = auto()
+    """
+    Target CPU architecture.
+    """
+    GPU = auto()
+    """
+    Target GPU architecture.
+    """
+class Backend(Enum):
+    """
+    The Backend enumeration represents all possible backends that can be used for the code generation.
+    Backends and targets must be combined with care. For example, CPU as a target together with CUDA as a
+    backend makes no sense.
+    Member values are assigned automatically by `auto()`.
+    """
+    C = auto()
+    """
+    Use the C Backend of pystencils.
+    """
+    CUDA = auto()
+    """
+    Use the CUDA backend to generate code for NVIDIA GPUs.
+    """
--- a/pystencils/fast_approximation.py
+++ b/pystencils/fast_approximation.py
@@ -4,20 +4,30 @@ import sympy as sp
 from pystencils.astnodes import Node
 from pystencils.simp import AssignmentCollection
+from pystencils.assignment import Assignment
 # noinspection PyPep8Naming
 class fast_division(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
    nargs = (2,)
 # noinspection PyPep8Naming
 class fast_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
    nargs = (1, )
 # noinspection PyPep8Naming
 class fast_inv_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
    nargs = (1, )
@@ -32,7 +42,7 @@ def _run(term, visitor):
        return visitor(term)
-def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
    def visit(expr):
        if isinstance(expr, Node):
            return expr
@@ -48,7 +58,7 @@ def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection])
    return _run(term, visit)
-def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
    def visit(expr):
        if isinstance(expr, Node):

--- a/pystencils/fd/__init__.py
+++ b/pystencils/fd/__init__.py
@@ -3,10 +3,11 @@ from .derivative import (
    expand_diff_full, expand_diff_linear, expand_diff_products, functional_derivative,
    normalize_diff_order, zero_diffs)
 from .finitedifferences import Discretization2ndOrder, advection, diffusion, transient
+from .finitevolumes import FVM1stOrder, VOF
 from .spatial import discretize_spatial, discretize_spatial_staggered
 __all__ = ['Diff', 'diff', 'DiffOperator', 'diff_terms', 'collect_diffs',
           'zero_diffs', 'evaluate_diffs', 'normalize_diff_order', 'expand_diff_full', 'expand_diff_linear',
           'expand_diff_products', 'combine_diff_products', 'functional_derivative',
           'advection', 'diffusion', 'transient', 'Discretization2ndOrder', 'discretize_spatial',
-           'discretize_spatial_staggered']
+           'discretize_spatial_staggered', 'FVM1stOrder', 'VOF']
--- a/pystencils/fd/derivation.py
+++ b/pystencils/fd/derivation.py
-import warnings
+import itertools
 from collections import defaultdict
 import numpy as np
 import sympy as sp
 from pystencils.field import Field
+from pystencils.stencil import direction_string_to_offset
 from pystencils.sympyextensions import multidimensional_sum, prod
 from pystencils.utils import LinearEquationSystem, fully_contains
@@ -106,7 +107,7 @@ class FiniteDifferenceStencilDerivation:
    @staticmethod
    def symbolic_weight(*args):
        str_args = [str(e) for e in args]
-        return sp.Symbol("w_({})".format(",".join(str_args)))
+        return sp.Symbol(f"w_({','.join(str_args)})")
    def error_term_dict(self, order):
        error_terms = defaultdict(lambda: 0)
@@ -125,7 +126,6 @@ class FiniteDifferenceStencilDerivation:
    def isotropy_equations(self, order):
        def cycle_int_sequence(sequence, modulus):
-            import numpy as np
            result = []
            arr = np.array(sequence, dtype=int)
            while True:
@@ -170,17 +170,8 @@ class FiniteDifferenceStencilDerivation:
            f = field_access
            return sum(f.get_shifted(*offset) * weight for offset, weight in zip(self.stencil, self.weights))
-        def as_matrix(self):
-            warnings.warn("as_matrix is deprecated and may be removed in the near future."
-                          "Please use as_array instead which will return an MutableDenseNDimArray."
-                          "as_array therefore can also work in 3 dimensions", category=DeprecationWarning)
-            dim = len(self.stencil[0])
-            assert dim == 2
-            max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
-            result = sp.Matrix(2 * max_offset + 1, 2 * max_offset + 1, lambda i, j: 0)
-            for direction, weight in zip(self.stencil, self.weights):
-                result[max_offset - direction[1], max_offset + direction[0]] = weight
-            return result
+        def __array__(self, dtype=None, copy=None):
+            """NumPy array protocol: return the stencil weights as a freshly built NumPy array.
+            Args:
+                dtype: optional dtype requested by NumPy (e.g. via ``np.array(stencil, dtype=float)``)
+                copy: NumPy's copy request; the array is always newly created from `as_array`, so a
+                      no-copy request (``copy=False``) cannot be honoured
+            Raises:
+                ValueError: if ``copy`` is False
+            """
+            if copy is False:
+                raise ValueError("the stencil weights cannot be exposed as a NumPy array without copying")
+            # NumPy passes `dtype` (and on newer versions `copy`) to __array__; without these parameters a
+            # call such as np.array(stencil, dtype=float) fails with a TypeError
+            return np.array(self.as_array().tolist(), dtype=dtype)
        def as_array(self):
            dim = len(self.stencil[0])
@@ -203,12 +194,12 @@ class FiniteDifferenceStencilDerivation:
            return result
-        def rotate_weights_and_apply(self, field_access: Field.Access, axis):
+        def rotate_weights_and_apply(self, field_access: Field.Access, axes):
            """derive gradient weights of other direction with already calculated weights of one direction
               via rotation and apply them to a field."""
            dim = len(self.stencil[0])
            assert (dim == 2 or dim == 3), "This function is only for 2D or 3D stencils available"
-            rotated_weights = np.rot90(np.array(self.as_array()).reshape(self.as_array().shape), 1, axis)
+            rotated_weights = np.rot90(np.array(self.__array__()), 1, axes)
            result = []
            max_offset = max(max(abs(e) for e in direction) for direction in self.stencil)
@@ -228,3 +219,119 @@ class FiniteDifferenceStencilDerivation:
        def __repr__(self):
            return "Finite difference stencil of accuracy {}, isotropic error: {}".format(self.accuracy,
                                                                                          self.is_isotropic)
+class FiniteDifferenceStaggeredStencilDerivation:
+    """Derives a finite difference stencil for application at a staggered position
+    Args:
+        neighbor: the neighbor direction string or vector at whose staggered position to calculate the derivative
+        dim: how many dimensions (2 or 3)
+        derivative: a tuple of directions over which to perform derivatives; if empty, only the stencil points
+                    are determined and no weights are available
+        free_weights_prefix: a string to prefix to free weight symbols. If None, do not return free weights
+    """
+    def __init__(self, neighbor, dim, derivative=tuple(), free_weights_prefix=None):
+        if isinstance(neighbor, str):
+            neighbor = direction_string_to_offset(neighbor)
+        if dim == 2:
+            # every component beyond the first two has to vanish; checked element-wise so that tuples, lists
+            # and arrays of any length are treated alike
+            assert all(v == 0 for v in neighbor[dim:])
+        # an empty derivative (of any sequence type) is allowed and means "points only"
+        assert len(derivative) == 0 or max(derivative) < dim
+        neighbor = sp.Matrix(neighbor[:dim])
+        # the staggered position lies half-way between the cell and its neighbor
+        pos = neighbor / 2
+        def unitvec(i):
+            """return the `i`-th unit vector in `dim` dimensions"""
+            a = np.zeros(dim, dtype=int)
+            a[i] = 1
+            return a
+        def flipped(a, i):
+            """return `a` with its `i`-th element's sign flipped"""
+            a = a.copy()
+            a[i] *= -1
+            return a
+        # determine the points to use, coordinates are relative to position
+        points = []
+        if np.linalg.norm(neighbor, 1) == 1:
+            # neighbor with one nonzero component: the two cells on either side of the staggered position
+            main_points = [neighbor / 2, neighbor / -2]
+        elif np.linalg.norm(neighbor, 1) == 2:
+            # neighbor with two nonzero components: additionally the two points with one sign flipped
+            nonzero_indices = [i for i, v in enumerate(neighbor) if v != 0 and i < dim]
+            main_points = [neighbor / 2, neighbor / -2, flipped(neighbor / 2, nonzero_indices[0]),
+                           flipped(neighbor / -2, nonzero_indices[0])]
+        else:
+            # remaining case: all sign combinations of the half neighbor vector
+            main_points = [sp.Matrix(np.multiply(neighbor, sp.Matrix(c) / 2))
+                           for c in itertools.product([-1, 1], repeat=3)]
+        points += main_points
+        # along every direction in which the neighbor vector is zero, add the main points shifted by +1 and -1
+        zero_indices = [i for i, v in enumerate(neighbor) if v == 0 and i < dim]
+        for i in zero_indices:
+            points += [point + sp.Matrix(unitvec(i)) for point in main_points]
+            points += [point - sp.Matrix(unitvec(i)) for point in main_points]
+        points_tuple = tuple([tuple(p) for p in points])
+        self._stencil = points_tuple
+        # determine the stencil weights
+        if len(derivative) == 0:
+            weights = None
+        else:
+            derivation = FiniteDifferenceStencilDerivation(derivative, points_tuple).get_stencil()
+            if not derivation.accuracy:
+                raise Exception('the requested derivative cannot be performed with the available neighbors')
+            weights = derivation.weights
+            # if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
+            free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
+            if free_weights_prefix is not None:
+                # keep the free weights, but rename them to `<prefix>_<index>` so the caller can resolve them
+                weights = [w.subs({fw: sp.Symbol(f"{free_weights_prefix}_{i}") for i, fw in enumerate(free_weights)})
+                           for w in weights]
+            elif len(free_weights) > 0:
+                # try all combinations of a few candidate values for the free weights and group the resulting
+                # weight lists by their number of zero entries; the all-zero result is discarded
+                zero_counts = defaultdict(list)
+                for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
+                                                repeat=len(free_weights)):
+                    subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
+                    weights = [w.subs(subs) for w in derivation.weights]
+                    if not all(a == 0 for a in weights):
+                        zero_count = sum([1 for w in weights if w == 0])
+                        zero_counts[zero_count].append(weights)
+                best = zero_counts[max(zero_counts.keys())]
+                if len(best) > 1:  # if there are multiple, pick the one that contains a nonzero center weight
+                    center = [tuple(p + pos) for p in points].index((0, 0, 0)[:dim])
+                    best = [b for b in best if b[center] != 0]
+                if len(best) > 1:  # if there are still multiple, they are equivalent, so we average
+                    weights = [sum([b[i] for b in best]) / len(best) for i in range(len(weights))]
+                else:
+                    weights = best[0]
+                assert weights
+        # `_points` holds the stencil points shifted by the staggered position, `_stencil` the unshifted ones
+        points_tuple = tuple([tuple(p + pos) for p in points])
+        self._points = points_tuple
+        self._weights = weights
+    @property
+    def points(self):
+        """return the points of the stencil"""
+        return self._points
+    @property
+    def stencil(self):
+        """return the points of the stencil relative to the staggered position specified by neighbor"""
+        return self._stencil
+    @property
+    def weights(self):
+        """return the weights of the stencil"""
+        assert self._weights is not None
+        return self._weights
+    def visualize(self):
+        """plot the stencil points; if weights are available, only points with nonzero weight are shown
+        and the weights are passed as plot data"""
+        if self._weights is None:
+            # no derivative was requested, so there is nothing to filter by: show every point
+            ws = None
+            pts = np.array(self.points, dtype=int)
+        else:
+            ws = np.array([w for w in self._weights if w != 0], dtype=float)
+            pts = np.array([p for p, w in zip(self.points, self._weights) if w != 0], dtype=int)
+        from pystencils.stencil import plot
+        plot(pts, data=ws)
+    def apply(self, access: Field.Access):
+        """return the weighted sum of `access` shifted to each stencil point"""
+        return sum([access.get_shifted(*point) * weight for point, weight in zip(self.points, self.weights)])
--- a/pystencils/fd/derivative.py
+++ b/pystencils/fd/derivative.py
@@ -109,7 +109,17 @@ class Diff(sp.Expr):
        return result
    def __str__(self):
-        return "D(%s)" % self.arg
+        return f"D({self.arg})"
+    def interpolated_access(self, offset, **kwargs):
+        """Represents an interpolated access on a spatially differentiated field
+        Args:
+            offset (Tuple[sympy.Expr]): Absolute position to determine the value of the spatial derivative
+            **kwargs: forwarded unchanged to the field's own ``interpolated_access``
+        """
+        from pystencils.interpolation_astnodes import DiffInterpolatorAccess
+        field = self.arg.field
+        assert isinstance(field, Field), "Must be field to enable interpolated accesses"
+        interpolated = field.interpolated_access(offset, **kwargs)
+        return DiffInterpolatorAccess(interpolated.symbol, self.target, *offset)
 class DiffOperator(sp.Expr):
@@ -218,7 +228,9 @@ def diff_terms(expr):
    Example:
        >>> x, y = sp.symbols("x, y")
-        >>> diff_terms( diff(x, 0, 0)  )
+        >>> diff_terms( diff(x, 0, 0) )
+        {Diff(Diff(x, 0, -1), 0, -1)}
+        >>> diff_terms( diff(x, 0, 0) + y )
        {Diff(Diff(x, 0, -1), 0, -1)}
    """
    result = set()
@@ -306,7 +318,8 @@ def expand_diff_full(expr, functions=None, constants=None):
            functions.difference_update(constants)
    def visit(e):
-        e = e.expand()
+        if not isinstance(e, sp.Tuple):
+            e = e.expand()
        if e.func == Diff:
            result = 0
@@ -331,6 +344,9 @@ def expand_diff_full(expr, functions=None, constants=None):
            return result
        elif isinstance(e, sp.Piecewise):
            return sp.Piecewise(*((expand_diff_full(a, functions, constants), b) for a, b in e.args))
+        elif isinstance(expr, sp.Tuple):
+            new_args = [visit(arg) for arg in e.args]
+            return sp.Tuple(*new_args)
        else:
            new_args = [visit(arg) for arg in e.args]
            return e.func(*new_args) if new_args else e
@@ -370,6 +386,9 @@ def expand_diff_linear(expr, functions=None, constants=None):
                return diff.split_linear(functions)
    elif isinstance(expr, sp.Piecewise):
        return sp.Piecewise(*((expand_diff_linear(a, functions, constants), b) for a, b in expr.args))
+    elif isinstance(expr, sp.Tuple):
+        new_args = [expand_diff_linear(e, functions) for e in expr.args]
+        return sp.Tuple(*new_args)
    else:
        new_args = [expand_diff_linear(e, functions) for e in expr.args]
        result = sp.expand(expr.func(*new_args) if new_args else expr)

--- a/pystencils/fd/finitedifferences.py
+++ b/pystencils/fd/finitedifferences.py
@@ -21,10 +21,13 @@ def diffusion(scalar, diffusion_coeff, idx=None):
    Examples:
        >>> f = Field.create_generic('f', spatial_dimensions=2)
-        >>> diffusion_term = diffusion(scalar=f, diffusion_coeff=sp.Symbol("d"))
+        >>> d = sp.Symbol("d")
+        >>> dx = sp.Symbol("dx")
+        >>> diffusion_term = diffusion(scalar=f, diffusion_coeff=d)
        >>> discretization = Discretization2ndOrder()
-        >>> discretization(diffusion_term)
+        >>> expected_output = ((f[-1, 0] + f[0, -1] - 4 * f[0, 0] + f[0, 1] + f[1, 0]) * d) / dx**2
-        (f_W*d + f_S*d - 4*f_C*d + f_N*d + f_E*d)/dx**2
+        >>> sp.simplify(discretization(diffusion_term) - expected_output)
+        0
    """
    if isinstance(scalar, Field):
        first_arg = scalar.center
@@ -76,13 +79,6 @@ class Discretization2ndOrder:
        self.dt = dt
        self.spatial_stencil = discretization_stencil_func
-    @staticmethod
-    def _diff_order(e):
-        if not isinstance(e, Diff):
-            return 0
-        else:
-            return 1 + Discretization2ndOrder._diff_order(e.args[0])
    def _discretize_diffusion(self, e):
        result = 0
        for c in range(e.dim):
@@ -109,6 +105,7 @@ class Discretization2ndOrder:
            return self._discretize_advection(e)
        elif isinstance(e, Diff):
            arg, *indices = diff_args(e)
            if not isinstance(arg, Field.Access):
                raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
            return self.spatial_stencil(indices, self.dx, arg)
@@ -116,29 +113,6 @@ class Discretization2ndOrder:
            new_args = [self._discretize_spatial(a) for a in e.args]
            return e.func(*new_args) if new_args else e
-    def _discretize_diff(self, e):
-        order = self._diff_order(e)
-        if order == 1:
-            fa = e.args[0]
-            index = e.target
-            return (fa.neighbor(index, 1) - fa.neighbor(index, -1)) / (2 * self.dx)
-        elif order == 2:
-            indices = sorted([e.target, e.args[0].target])
-            fa = e.args[0].args[0]
-            if indices[0] == indices[1] and all(i >= 0 for i in indices):
-                result = (-2 * fa + fa.neighbor(indices[0], -1) + fa.neighbor(indices[0], +1))
-            elif indices[0] == indices[1]:
-                result = 0
-                for d in range(fa.field.spatial_dimensions):
-                    result += (-2 * fa + fa.neighbor(d, -1) + fa.neighbor(d, +1))
-            else:
-                assert all(i >= 0 for i in indices)
-                offsets = [(1, 1), [-1, 1], [1, -1], [-1, -1]]
-                result = sum(o1 * o2 * fa.neighbor(indices[0], o1).neighbor(indices[1], o2) for o1, o2 in offsets) / 4
-            return result / (self.dx ** 2)
-        else:
-            raise NotImplementedError("Term contains derivatives of order > 2")
    def __call__(self, expr):
        if isinstance(expr, list):
            return [self(e) for e in expr]
@@ -188,7 +162,7 @@ class Advection(sp.Function):
        return self.scalar.spatial_dimensions
    def _latex(self, printer):
-        name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+        name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
        if isinstance(self.vector, Field):
            return r"\nabla \cdot(%s %s)" % (printer.doprint(sp.Symbol(self.vector.name)),
                                             printer.doprint(sp.Symbol(self.scalar.name + name_suffix)))
@@ -235,7 +209,7 @@ class Diffusion(sp.Function):
        return self.scalar.spatial_dimensions
    def _latex(self, printer):
-        name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+        name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
        coeff = self.diffusion_coeff
        diff_coeff = sp.Symbol(coeff.name) if isinstance(coeff, Field) else coeff
        return r"div(%s \nabla %s)" % (printer.doprint(diff_coeff),
@@ -268,7 +242,7 @@ class Transient(sp.Function):
        return None if len(self.args) <= 1 else int(self.args[1])
    def _latex(self, printer):
-        name_suffix = "_%s" % self.scalar_index if self.scalar_index is not None else ""
+        name_suffix = f"_{self.scalar_index}" if self.scalar_index is not None else ""
        return r"\partial_t %s" % (printer.doprint(sp.Symbol(self.scalar.name + name_suffix)),)
@@ -311,8 +285,9 @@ def discretize_center(term, symbols_to_field_dict, dx, dim=3):
      >>> term
      x*x^Delta^0
      >>> f = Field.create_generic('f', spatial_dimensions=3)
-      >>> discretize_center(term, { x: f }, dx=1, dim=3)
+      >>> expected_output = f[0, 0, 0] * (-f[-1, 0, 0]/2 + f[1, 0, 0]/2)
-      f_C*(-f_W/2 + f_E/2)
+      >>> sp.simplify(discretize_center(term, { x: f }, dx=1, dim=3) - expected_output)
+      0
    """
    substitutions = {}
    for symbols, field in symbols_to_field_dict.items():
@@ -362,7 +337,7 @@ def discretize_staggered(term, symbols_to_field_dict, coordinate, coordinate_off
        offset = [0] * dim
        offset[coordinate] = coordinate_offset
-        offset = np.array(offset, dtype=np.int)
+        offset = np.array(offset, dtype=int)
        gradient = grad(symbols)[coordinate]
        substitutions.update({s: (field[offset](i) + field(i)) / 2 for i, s in enumerate(symbols)})
@@ -394,8 +369,10 @@ def discretize_divergence(vector_term, symbols_to_field_dict, dx):
        >>> x, dx = sp.symbols("x dx")
        >>> grad_x = grad(x, dim=3)
        >>> f = Field.create_generic('f', spatial_dimensions=3)
-        >>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx))
+        >>> expected_output = (f[-1, 0, 0] + f[0, -1, 0] + f[0, 0, -1] -
-        (f_W + f_S + f_B - 6*f_C + f_T + f_N + f_E)/dx**2
+        ... 6*f[0, 0, 0] + f[0, 0, 1] + f[0, 1, 0] + f[1, 0, 0])/dx**2
+        >>> sp.simplify(discretize_divergence(grad_x, {x : f}, dx) - expected_output)
+        0
    """
    dim = len(vector_term)
    result = 0
@@ -408,7 +385,7 @@ def discretize_divergence(vector_term, symbols_to_field_dict, dx):
 def __up_down_offsets(d, dim):
    coord = [0] * dim
    coord[d] = 1
-    up = np.array(coord, dtype=np.int)
+    up = np.array(coord, dtype=int)
    coord[d] = -1
-    down = np.array(coord, dtype=np.int)
+    down = np.array(coord, dtype=int)
    return up, down
--- a/src/pystencils/fd/finitevolumes.py
+++ b/src/pystencils/fd/finitevolumes.py
+import pystencils as ps
+import sympy as sp
+from pystencils.fd.derivation import FiniteDifferenceStaggeredStencilDerivation as FDS, \
+    FiniteDifferenceStencilDerivation as FD
+import itertools
+from collections import defaultdict
+from collections.abc import Iterable
+def get_access_and_direction(term):
+    """Split a first or second derivative term into the differentiated field access and its directions.
+    Returns:
+        tuple ``(access, direction)`` where ``direction`` is a 1-tuple for a first derivative and a 2-tuple
+        (outer direction first) for a nested derivative
+    Raises:
+        ValueError: for derivatives nested more than twice or whose innermost argument is not a field access
+        NotImplementedError: if the argument of ``term`` is neither a field access nor a derivative
+    """
+    outer_direction = term.args[1]
+    inner = term.args[0]
+    if isinstance(inner, ps.Field.Access):  # first derivative
+        return inner, (outer_direction,)
+    if not isinstance(inner, ps.fd.Diff):
+        raise NotImplementedError(f"can only deal with derivatives of field accesses, "
+                                  f"but not {type(inner)}; expansion of derivatives probably failed")
+    # nested derivative
+    innermost = inner.args[0]
+    if isinstance(innermost, ps.fd.Diff):  # third or higher derivative
+        raise ValueError("can only handle first and second derivatives")
+    if not isinstance(innermost, ps.Field.Access):
+        raise ValueError("can only handle derivatives of field accesses")
+    access, inner_direction = inner.args[:2]
+    return access, (outer_direction, inner_direction)
+class FVM1stOrder:
+    """Finite-volume discretization
+    Args:
+        field: the field with the quantity to calculate, e.g. a concentration
+        flux: a list of sympy expressions that specify the flux, one for each cartesian direction
+        source: a list of sympy expressions that specify the source
+    """
+    def __init__(self, field: ps.field.Field, flux=0, source=0):
+        """Store the field and bring ``flux`` and ``source`` into array form.
+        ``flux`` is normalized against the shape ``(dim,) + field.index_shape`` and ``source`` against
+        ``field.index_shape``; scalars are repeated to fill that shape.
+        """
+        def normalize(f, shape):
+            # singleton dimensions are dropped; an empty remainder is passed on as "no shape"
+            shape = tuple(s for s in shape if s != 1) or None
+            if isinstance(f, (sp.Array, Iterable, sp.Matrix)):
+                return sp.Array(f, shape)
+            # scalar input: replicate it once per entry of the target shape
+            repetitions = sp.Mul(*shape) if shape else 1
+            return sp.Array([f] * repetitions)
+        self.c = field
+        self.dim = self.c.spatial_dimensions
+        self.j = normalize(flux, (self.dim, ) + self.c.index_shape)
+        self.q = normalize(source, self.c.index_shape)
+    def discrete_flux(self, flux_field: ps.field.Field):
+        """Return a list of assignments for the discrete fluxes
+        The continuous flux is projected onto every direction of the staggered stencil of `flux_field`,
+        discretized at the corresponding staggered position and scaled by the neighbor distance and a
+        stencil-dependent factor.
+        Args:
+            flux_field: a staggered field to which the fluxes should be assigned
+        """
+        assert ps.FieldType.is_staggered(flux_field)
+        # counts the derivative terms discretized so far; used to give free weight symbols distinct prefixes
+        num = 0
+        def discretize(term, neighbor):
+            nonlocal num
+            if isinstance(term, sp.Matrix):
+                # discretize matrices entry by entry
+                nw = term.applyfunc(lambda t: discretize(t, neighbor))
+                return nw
+            elif isinstance(term, ps.field.Field.Access):
+                # a plain field access becomes the mean of the value here and at the neighbor
+                avg = (term.get_shifted(*neighbor) + term) * sp.Rational(1, 2)
+                return avg
+            elif isinstance(term, ps.fd.Diff):
+                access, direction = get_access_and_direction(term)
+                # free weights are only kept (under a prefix) when the squared length of the neighbor
+                # vector exceeds 2; otherwise the stencil derivation resolves them itself
+                fds = FDS(neighbor, access.field.spatial_dimensions, direction,
+                          free_weights_prefix=f'fvm_free_{num}' if sp.Matrix(neighbor).dot(neighbor) > 2 else None)
+                num += 1
+                return fds.apply(access)
+            # any other expression: rebuild it from its discretized arguments, leaves are returned unchanged
+            if term.args:
+                new_args = [discretize(a, neighbor) for a in term.args]
+                return term.func(*new_args)
+            else:
+                return term
+        fluxes = self.j.applyfunc(ps.fd.derivative.expand_diff_full)
+        # one entry per spatial direction; wrapped in a Matrix when the flux field has more than one index dimension
+        fluxes = [sp.Matrix(fluxes.tolist()[i]) if flux_field.index_dimensions > 1 else fluxes.tolist()[i] 
+                  for i in range(self.dim)]
+        # normalization factor: sum of the norms of all staggered stencil directions, divided by the dimension
+        A0 = sum([sp.Matrix(ps.stencil.direction_string_to_offset(d)).norm()
+                  for d in flux_field.staggered_stencil]) / self.dim
+        discrete_fluxes = []
+        for neighbor in flux_field.staggered_stencil:
+            neighbor = ps.stencil.direction_string_to_offset(neighbor)
+            # projection of the flux onto the neighbor direction: sum over i of fluxes[i] * neighbor[i]
+            directional_flux = fluxes[0] * int(neighbor[0])
+            for i in range(1, self.dim):
+                directional_flux += fluxes[i] * int(neighbor[i])
+            discrete_flux = sp.simplify(discretize(directional_flux, neighbor))
+            free_weights = [s for s in discrete_flux.atoms(sp.Symbol) if s.name.startswith('fvm_free_')]
+            if len(free_weights) > 0:
+                # try the values -1, 0, 1 for every remaining free weight, keep the substitutions that
+                # leave the fewest distinct field accesses and average over them
+                discrete_flux = discrete_flux.collect(discrete_flux.atoms(ps.field.Field.Access))
+                access_counts = defaultdict(list)
+                for values in itertools.product([-1, 0, 1],
+                                                repeat=len(free_weights)):
+                    subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
+                    simp = discrete_flux.subs(subs)
+                    access_count = len(simp.atoms(ps.field.Field.Access))
+                    access_counts[access_count].append(simp)
+                best_count = min(access_counts.keys())
+                discrete_flux = sum(access_counts[best_count]) / len(access_counts[best_count])
+            discrete_fluxes.append(discrete_flux / sp.Matrix(neighbor).norm())
+        if flux_field.index_dimensions > 1:
+            # vector-valued case: one assignment per component; directions whose flux is falsy are skipped
+            return [ps.Assignment(lhs, rhs / A0)
+                    for i, d in enumerate(flux_field.staggered_stencil) if discrete_fluxes[i]
+                    for lhs, rhs in zip(flux_field.staggered_vector_access(d), sp.simplify(discrete_fluxes[i]))]
+        else:
+            # scalar case: exactly one assignment per staggered direction
+            return [ps.Assignment(flux_field.staggered_access(d), sp.simplify(discrete_fluxes[i]) / A0)
+                    for i, d in enumerate(flux_field.staggered_stencil)]
+    def discrete_source(self):
+        """Return a list of assignments for the discrete source term
+        Every derivative (``ps.fd.Diff``) found in the fully expanded source term ``self.q`` is replaced by a
+        weighted sum of field accesses on neighboring cells. The results are assigned to the entries of
+        ``self.c.center_vector``; entries whose discretized source is falsy (e.g. zero) produce no assignment.
+        Raises:
+            NotImplementedError: if free stencil weights remain and more than one choice is equally sparse
+            Exception: if none of the candidate stencils yields usable weights
+        """
+        def discretize(term):
+            # replace a derivative by its finite-difference stencil, otherwise recurse into the sub-expressions
+            if isinstance(term, ps.fd.Diff):
+                access, direction = get_access_and_direction(term)
+                # full neighborhood: every combination of N/S, E/W (and T/B in 3D) except the empty one
+                if self.dim == 2:
+                    stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ")
+                               if "".join(a).strip()]
+                else:
+                    stencil = ["".join(a).replace(" ", "") for a in itertools.product("NS ", "EW ", "TB ")
+                               if "".join(a).strip()]
+                weights = None
+                # try the axis-aligned neighbors first, then the full neighborhood; the loop rebinds ``stencil``,
+                # which afterwards holds the offset tuples of the last stencil that was tried
+                for stencil in [["N", "S", "E", "W", "T", "B"][:2 * self.dim], stencil]:
+                    stencil = [tuple(ps.stencil.direction_string_to_offset(d, self.dim)) for d in stencil]
+                    derivation = FD(direction, stencil).get_stencil()
+                    # a falsy accuracy is treated as "this stencil is not usable"
+                    if not derivation.accuracy:
+                        continue
+                    weights = derivation.weights
+                    # if the weights are underdefined, we can choose the free symbols to find the sparsest stencil
+                    free_weights = set(itertools.chain(*[w.free_symbols for w in weights]))
+                    if len(free_weights) > 0:
+                        # group every candidate substitution by the number of weights it turns into zero
+                        zero_counts = defaultdict(list)
+                        for values in itertools.product([-1, -sp.Rational(1, 2), 0, 1, sp.Rational(1, 2)],
+                                                        repeat=len(free_weights)):
+                            subs = {free_weight: value for free_weight, value in zip(free_weights, values)}
+                            weights = [w.subs(subs) for w in derivation.weights]
+                            # skip the trivial choice in which every weight vanishes
+                            if not all(a == 0 for a in weights):
+                                zero_count = sum([1 for w in weights if w == 0])
+                                zero_counts[zero_count].append(weights)
+                        # candidates with the largest number of zero weights
+                        best = zero_counts[max(zero_counts.keys())]
+                        if len(best) > 1:
+                            raise NotImplementedError("more than one suitable set of weights found, "
+                                                      "don't know how to proceed")
+                        weights = best[0]
+                    # the first usable stencil wins
+                    break
+                if not weights:
+                    raise Exception('the requested derivative cannot be performed with the available neighbors')
+                # redundant with the check above
+                assert weights
+                if access._field.index_dimensions == 0:
+                    # field without index dimensions: weighted sum of the field at every stencil point
+                    return sum([access._field.__getitem__(point) * weight for point, weight in zip(stencil, weights)])
+                else:
+                    # indexed field: shift the access itself and keep the index of the original access
+                    total = access.get_shifted(*stencil[0]).at_index(*access.index) * weights[0]
+                    for point, weight in zip(stencil[1:], weights[1:]):
+                        addl = access.get_shifted(*point).at_index(*access.index) * weight
+                        total += addl
+                    return total
+            if term.args:
+                new_args = [discretize(a) for a in term.args]
+                return term.func(*new_args)
+            else:
+                return term
+        source = self.q.applyfunc(ps.fd.derivative.expand_diff_full)
+        source = source.applyfunc(discretize)
+        return [ps.Assignment(lhs, rhs) for lhs, rhs in zip(self.c.center_vector, sp.flatten(source)) if rhs]
+    def discrete_continuity(self, flux_field: ps.field.Field):
+        """Return a list of assignments for the continuity equation, which includes the source term
+        The fluxes at all staggered positions around a cell (the stencil directions and their inverses) are
+        summed and subtracted from the cell value; where the discrete source has a term for that value, it is
+        added on top.
+        Args:
+            flux_field: a staggered field from which the fluxes are taken
+        """
+        assert ps.FieldType.is_staggered(flux_field)
+        stencil = flux_field.staggered_stencil
+        first, *remaining = stencil + [ps.stencil.inverse_direction_string(direction) for direction in stencil]
+        divergence = flux_field.staggered_vector_access(first)
+        for direction in remaining:
+            divergence += flux_field.staggered_vector_access(direction)
+        # map every updated value to its source contribution
+        source_terms = {assignment.lhs: assignment.rhs for assignment in self.discrete_source()}
+        assignments = []
+        for lhs, rhs in zip(self.c.center_vector, divergence):
+            update = lhs - rhs
+            if lhs in source_terms:
+                update = lhs - rhs + source_terms[lhs]
+            assignments.append(ps.Assignment(lhs, update))
+        return assignments
+def VOF(j: ps.field.Field, v: ps.field.Field, ρ: ps.field.Field):
+    """Volume-of-fluid discretization of advection
+    Args:
+        j: the staggered field to write the fluxes to. Should have a D2Q9/D3Q27 stencil. Other stencils work too, but
+           incur a small error (D2Q5/D3Q7: v^2, D3Q19: v^3).
+        v: the flow velocity field
+        ρ: the quantity to advect
+    Returns:
+        a list of assignments to the staggered positions of ``j``; each right-hand side is the sum of the flux
+        going out of the cell and the flux coming in from the neighbor in the respective stencil direction
+    """
+    assert ps.FieldType.is_staggered(j)
+    def switch(condition):
+        # 1 where the condition holds, 0 everywhere else
+        return sp.Piecewise((1, condition), (0, True))
+    velocity_here = v.center_vector
+    fluxes = []
+    for direction in j.staggered_stencil:
+        c = ps.stencil.direction_string_to_offset(direction)
+        velocity_there = v.neighbor_vector(c)
+        # going out
+        axes = range(len(velocity_here))
+        leaves = sp.And(*[sp.Or(c[k] * velocity_here[k] > 0, c[k] == 0) for k in axes])
+        overlap = sp.Mul(*[(1 - sp.Abs(velocity_here[k])) if c[k] == 0 else c[k] * velocity_here[k]
+                           for k in axes])
+        outgoing = ρ.center_vector * overlap * switch(leaves)
+        # coming in
+        axes = range(len(velocity_there))
+        enters = sp.And(*[sp.Or(c[k] * velocity_there[k] < 0, c[k] == 0) for k in axes])
+        overlap = sp.Mul(*[(1 - sp.Abs(velocity_there[k])) if c[k] == 0 else velocity_there[k]
+                           for k in axes])
+        sign = (c == 1).sum() % 2 * 2 - 1
+        incoming = sign * ρ.neighbor_vector(c) * overlap * switch(enters)
+        fluxes.append(outgoing + incoming)
+    return [ps.Assignment(lhs, rhs)
+            for direction, flux in zip(j.staggered_stencil, fluxes)
+            for lhs, rhs in zip(j.staggered_vector_access(direction).values(), flux.values())]
--- a/pystencils/fd/spatial.py
+++ b/pystencils/fd/spatial.py
+from functools import lru_cache
 from typing import Tuple
 import sympy as sp
 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.cache import memorycache
 from pystencils.fd import Diff
 from pystencils.field import Field
 from pystencils.transformations import generic_visit
@@ -72,43 +72,12 @@ def fd_stencils_forth_order_isotropic(indices, dx, fa):
    return stencils[dim].apply(fa) / dx
-def fd_stencils_isotropic_high_density_code(indices, dx, fa):
-    dim = fa.field.spatial_dimensions
-    if dim == 1:
-        return fd_stencils_standard(indices, dx, fa)
-    elif dim == 2:
-        order = len(indices)
-        if order == 1:
-            idx = indices[0]
-            assert 0 <= idx < 2
-            other_idx = 1 if indices[0] == 0 else 0
-            weights = {-1: sp.Rational(1, 12) / dx,
-                       0: sp.Rational(1, 3) / dx,
-                       1: sp.Rational(1, 12) / dx}
-            upper_terms = sum(fa.neighbor(idx, +1).neighbor(other_idx, off) * w for off, w in weights.items())
-            lower_terms = sum(fa.neighbor(idx, -1).neighbor(other_idx, off) * w for off, w in weights.items())
-            return upper_terms - lower_terms
-        elif order == 2:
-            if indices[0] == indices[1]:
-                idx = indices[0]
-                diagonals = sp.Rational(1, 8) * sum(fa.neighbor(0, i).neighbor(1, j) for i in (-1, 1) for j in (-1, 1))
-                div_direction = sp.Rational(1, 2) * sum(fa.neighbor(idx, i) for i in (-1, 1))
-                center = - sp.Rational(3, 2) * fa
-                return (diagonals + div_direction + center) / (dx ** 2)
-            else:
-                return fd_stencils_standard(indices, dx, fa)
-    raise NotImplementedError("Supports only derivatives up to order 2 for 1D and 2D setups")
 def discretize_spatial(expr, dx, stencil=fd_stencils_standard):
    if isinstance(stencil, str):
        if stencil == 'standard':
            stencil = fd_stencils_standard
        elif stencil == 'isotropic':
            stencil = fd_stencils_isotropic
-        elif stencil == 'isotropic_hd':
-            stencil = fd_stencils_isotropic_high_density_code
        else:
            raise ValueError("Unknown stencil. Supported 'standard' and 'isotropic'")
@@ -167,9 +136,7 @@ def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
 # -------------------------------------- special stencils --------------------------------------------------------------
+@lru_cache(maxsize=1)
-@memorycache(maxsize=1)
 def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
    # Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
    # one weight has to be specifically set to a somewhat arbitrary value

--- a/pystencils/field.py
+++ b/pystencils/field.py
+import functools
 import hashlib
+import operator
 import pickle
 import re
 from enum import Enum
 from itertools import chain
-from typing import List, Optional, Sequence, Set, Tuple
+from typing import List, Optional, Sequence, Set, Tuple, Union
 import numpy as np
 import sympy as sp
 from sympy.core.cache import cacheit
+import pystencils
 from pystencils.alignedarray import aligned_empty
-from pystencils.data_types import StructType, TypedSymbol, create_type
+from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
-from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
+from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
-from pystencils.stencil import direction_string_to_offset, offset_to_direction_string
+from pystencils.stencil import (
+    direction_string_to_offset, inverse_direction, offset_to_direction_string)
 from pystencils.sympyextensions import is_integer_sequence
-__all__ = ['Field', 'fields', 'FieldType', 'AbstractField']
+# public names of this module ('Field' was listed twice before)
+__all__ = ['Field', 'fields', 'FieldType']
-def fields(description=None, index_dimensions=0, layout=None, **kwargs):
-    """Creates pystencils fields from a string description.
-    Examples:
-        Create a 2D scalar and vector field:
-            >>> s, v = fields("s, v(2): double[2D]")
-            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
-            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
-        Create an integer field of shape (10, 20):
-            >>> f = fields("f : int32[10, 20]")
-            >>> f.has_fixed_shape, f.shape
-            (True, (10, 20))
-        Numpy arrays can be used as template for shape and data type of field:
-            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
-            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
-            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
-            >>> assert v.index_shape == (2,)
-        Format string can be left out, field names are taken from keyword arguments.
-            >>> fields(f1=arr_s, f2=arr_s)
-            [f1, f2]
-        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
-            >>> f = fields(f=arr_v, index_dimensions=1)
-            >>> assert f.index_dimensions == 1
-            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
-            >>> f.layout
-            (2, 1, 0)
-    """
-    result = []
-    if description:
-        field_descriptions, dtype, shape = _parse_description(description)
-        layout = 'numpy' if layout is None else layout
-        for field_name, idx_shape in field_descriptions:
-            if field_name in kwargs:
-                arr = kwargs[field_name]
-                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
-                assert idx_shape_of_arr == idx_shape
-                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape))
-            elif isinstance(shape, tuple):
-                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
-                                            index_dimensions=len(idx_shape), layout=layout)
-            elif isinstance(shape, int):
-                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout)
-            elif shape is None:
-                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout)
-            else:
-                assert False
-            result.append(f)
-    else:
-        assert layout is None, "Layout can not be specified when creating Field from numpy array"
-        for field_name, arr in kwargs.items():
-            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions))
-    if len(result) == 0:
-        return None
-    elif len(result) == 1:
-        return result[0]
-    else:
-        return result
 class FieldType(Enum):
@@ -96,6 +33,10 @@ class FieldType(Enum):
    # unsafe fields may be accessed in an absolute fashion - the index depends on the data
    # and thus may lead to out-of-bounds accesses
    CUSTOM = 3
+    # staggered field
+    STAGGERED = 4
+    # staggered field that reverses sign when accessed via opposite direction
+    STAGGERED_FLUX = 5
    @staticmethod
    def is_generic(field):
@@ -117,14 +58,18 @@ class FieldType(Enum):
        assert isinstance(field, Field)
        return field.field_type == FieldType.CUSTOM
+    @staticmethod
+    def is_staggered(field):
+        """Return True if ``field`` is of type STAGGERED or STAGGERED_FLUX."""
+        assert isinstance(field, Field)
+        return field.field_type in (FieldType.STAGGERED, FieldType.STAGGERED_FLUX)
-class AbstractField:
+    @staticmethod
+    def is_staggered_flux(field):
-    class AbstractAccess:
+        assert isinstance(field, Field)
-        pass
+        return field.field_type == FieldType.STAGGERED_FLUX
-class Field(AbstractField):
+class Field:
    """
    With fields one can formulate stencil-like update rules on structured grids.
    This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.
@@ -156,6 +101,14 @@ class Field(AbstractField):
        First specify the spatial offsets in [], then in case index_dimension>0 the indices in ()
        e.g. ``f[-1,0,0](7)``
+    Staggered Fields:
+        Staggered fields are used to store a value on a second grid shifted by half a cell with respect to the usual
+        grid.
+        The first index dimension is used to specify the position on the staggered grid (e.g. 0 means half-way to the
+        eastern neighbor, 1 is half-way to the northern neighbor, etc.), while additional indices can be used to store
+        multiple values at each position.
    Example using no index dimensions:
        >>> a = np.zeros([10, 10])
        >>> f = Field.create_from_numpy_array("f", a, index_dimensions=0)
@@ -185,8 +138,9 @@ class Field(AbstractField):
            index_shape: optional shape of the index dimensions i.e. maximum values allowed for each index dimension,
                        has to be a list or tuple
            field_type: besides the normal GENERIC fields, there are INDEXED fields that store indices of the domain
-                        that should be iterated over, and BUFFER fields that are used to generate
+                        that should be iterated over, BUFFER fields that are used to generate communication
-                        communication packing/unpacking kernels
+                        packing/unpacking kernels, and STAGGERED fields, which store values half-way to the next
+                        cell
        """
        if index_shape is not None:
            assert index_dimensions == 0 or index_dimensions == len(index_shape)
@@ -208,11 +162,14 @@ class Field(AbstractField):
                raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
            shape += (1,)
            strides += (1,)
+        if field_type == FieldType.STAGGERED and index_dimensions == 0:
+            raise ValueError("A staggered field needs at least one index dimension")
        return Field(field_name, field_type, dtype, layout, shape, strides)
    @staticmethod
-    def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0) -> 'Field':
+    def create_from_numpy_array(field_name: str, array: np.ndarray, index_dimensions: int = 0,
+                                field_type=FieldType.GENERIC) -> 'Field':
        """Creates a field based on the layout, data type, and shape of a given numpy array.
        Kernels created for these kind of fields can only be called with arrays of the same layout, shape and type.
@@ -221,6 +178,7 @@ class Field(AbstractField):
            field_name: symbolic name for the field
            array: numpy array
            index_dimensions: see documentation of Field
+            field_type: kind of field
        """
        spatial_dimensions = len(array.shape) - index_dimensions
        if spatial_dimensions < 1:
@@ -239,12 +197,15 @@ class Field(AbstractField):
                raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
            shape += (1,)
            strides += (1,)
+        if field_type == FieldType.STAGGERED and index_dimensions == 0:
+            raise ValueError("A staggered field needs at least one index dimension")
-        return Field(field_name, FieldType.GENERIC, array.dtype, spatial_layout, shape, strides)
+        return Field(field_name, field_type, array.dtype, spatial_layout, shape, strides)
    @staticmethod
    def create_fixed_size(field_name: str, shape: Tuple[int, ...], index_dimensions: int = 0,
-                          dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None) -> 'Field':
+                          dtype=np.float64, layout: str = 'numpy', strides: Optional[Sequence[int]] = None,
+                          field_type=FieldType.GENERIC) -> 'Field':
        """
        Creates a field with fixed sizes i.e. can be called only with arrays of the same size and layout
@@ -255,6 +216,7 @@ class Field(AbstractField):
            dtype: numpy data type of the array the kernel is called with later
            layout: full layout of array, not only spatial dimensions
            strides: strides in bytes or None to automatically compute them from shape (assuming no padding)
+            field_type: kind of field
        """
        spatial_dimensions = len(shape) - index_dimensions
        assert spatial_dimensions >= 1
@@ -275,11 +237,13 @@ class Field(AbstractField):
                raise ValueError("Structured arrays/fields are not allowed to have an index dimension")
            shape += (1,)
            strides += (1,)
+        if field_type == FieldType.STAGGERED and index_dimensions == 0:
+            raise ValueError("A staggered field needs at least one index dimension")
        spatial_layout = list(layout)
        for i in range(spatial_dimensions, len(layout)):
            spatial_layout.remove(i)
-        return Field(field_name, FieldType.GENERIC, dtype, tuple(spatial_layout), shape, strides)
+        return Field(field_name, field_type, dtype, tuple(spatial_layout), shape, strides)
    def __init__(self, field_name, field_type, dtype, layout, shape, strides):
        """Do not use directly. Use static create* methods"""
@@ -291,14 +255,17 @@ class Field(AbstractField):
        self._layout = normalize_layout(layout)
        self.shape = shape
        self.strides = strides
-        self.latex_name = None  # type: Optional[str]
+        self.latex_name: Optional[str] = None
+        self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
+        self.coordinate_transform = sp.eye(self.spatial_dimensions)
+        if field_type == FieldType.STAGGERED:
+            assert self.staggered_stencil
    def new_field_with_different_name(self, new_name):
        if self.has_fixed_shape:
            return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
        else:
-            return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
+            return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)
-                                        self.index_dimensions, self._layout, self.index_shape, self.field_type)
    @property
    def spatial_dimensions(self) -> int:
@@ -312,6 +279,9 @@ class Field(AbstractField):
    def ndim(self) -> int:
        return len(self.shape)
+    def values_per_cell(self) -> int:
+        """Return the product of all entries of ``index_shape`` (1 if there are no index dimensions)."""
+        count = 1
+        for extent in self.index_shape:
+            count = operator.mul(count, extent)
+        return count
    @property
    def layout(self):
        return self._layout
@@ -348,8 +318,24 @@ class Field(AbstractField):
    def dtype(self):
        return self._dtype
+    @property
+    def itemsize(self):
+        """Item size reported by the numpy dtype that backs the field's data type."""
+        numpy_dtype = self.dtype.numpy_dtype
+        return numpy_dtype.itemsize
    def __repr__(self):
-        return self._field_name
+        if any(isinstance(s, sp.Symbol) for s in self.spatial_shape):
+            spatial_shape_str = f'{self.spatial_dimensions}d'
+        else:
+            spatial_shape_str = ','.join(str(i) for i in self.spatial_shape)
+        index_shape_str = ','.join(str(i) for i in self.index_shape)
+        if self.index_shape:
+            return f'{self._field_name}({index_shape_str}): {self.dtype}[{spatial_shape_str}]'
+        else:
+            return f'{self._field_name}: {self.dtype}[{spatial_shape_str}]'
+    def __str__(self):
+        """Return only the field name."""
+        return self.name
    def neighbor(self, coord_id, offset):
        offset_list = [0] * self.spatial_dimensions
@@ -364,19 +350,37 @@ class Field(AbstractField):
        index_shape = self.index_shape
        if len(index_shape) == 0:
            return sp.Matrix([self.center])
-        if len(index_shape) == 1:
+        elif len(index_shape) == 1:
            return sp.Matrix([self(i) for i in range(index_shape[0])])
        elif len(index_shape) == 2:
-            def cb(*args):
+            return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
-                r = self.__call__(*args)
+        elif len(index_shape) == 3:
-                return r
+            return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
-            return sp.Matrix(*index_shape, cb)
+                              for j in range(index_shape[1])] for i in range(index_shape[0])])
+        else:
+            raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")
    @property
    def center(self):
        center = tuple([0] * self.spatial_dimensions)
        return Field.Access(self, center)
+    def neighbor_vector(self, offset):
+        """Like neighbor, but returns the entire vector/tensor stored at offset.
+        A three-component offset is accepted for two-dimensional fields as long as its last entry is zero.
+        """
+        if self.spatial_dimensions == 2 and len(offset) == 3:
+            assert offset[2] == 0
+            offset = offset[:2]
+        def value(*index):
+            # access at the requested offset, specialised to the given index
+            return self[offset](*index)
+        rank = self.index_dimensions
+        if rank == 0:
+            return sp.Matrix([self[offset]])
+        if rank == 1:
+            return sp.Matrix([value(i) for i in range(self.index_shape[0])])
+        if rank == 2:
+            return sp.Matrix([[value(i, k) for k in range(self.index_shape[1])]
+                              for i in range(self.index_shape[0])])
+        raise NotImplementedError("neighbor_vector is not implemented for more than 2 index dimensions")
    def __getitem__(self, offset):
        if type(offset) is np.ndarray:
            offset = tuple(offset)
@@ -385,21 +389,115 @@ class Field(AbstractField):
        if type(offset) is not tuple:
            offset = (offset,)
        if len(offset) != self.spatial_dimensions:
-            raise ValueError("Wrong number of spatial indices: "
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
-                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))
        return Field.Access(self, offset)
    def absolute_access(self, offset, index):
        assert FieldType.is_custom(self)
        return Field.Access(self, offset, index, is_absolute_access=True)
+    def staggered_access(self, offset, index=None):
+        """If this field is a staggered field, it can be accessed using half-integer offsets.
+        For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east
+        of the cell center, i.e. half-way to the eastern-next cell.
+        If the field stores more than one value per staggered point (e.g. a vector or a tensor), the index (integer or
+        tuple of integers) refers to which of these values to access.
+        Args:
+            offset: direction string, numpy array or sequence with one entry per spatial dimension
+            index: position inside the vector/tensor stored at the staggered point; must be None if the field
+                   has exactly one index dimension
+        Returns:
+            the field access for the staggered point, multiplied by -1 if the field is a staggered flux field and
+            the point is only reachable via the inverse of a stencil direction
+        Raises:
+            ValueError: on a wrong number of offsets or indices, or if the offset is not part of the stencil
+        """
+        assert FieldType.is_staggered(self)
+        # keep the caller's value for the error message below
+        offset_orig = offset
+        if type(offset) is np.ndarray:
+            offset = tuple(offset)
+        if type(offset) is str:
+            # a direction string denotes the point half-way towards that neighbor
+            offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
+            offset = tuple([o * sp.Rational(1, 2) for o in offset])
+        if len(offset) != self.spatial_dimensions:
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
+        prefactor = 1
+        neighbor_vec = [0] * len(offset)
+        # components for which offset + 1/2 is a sympy Integer point towards a neighbor; all others stay 0
+        for i in range(self.spatial_dimensions):
+            if (offset[i] + sp.Rational(1, 2)).is_Integer:
+                neighbor_vec[i] = sp.sign(offset[i])
+        neighbor = offset_to_direction_string(neighbor_vec)
+        if neighbor not in self.staggered_stencil:
+            # the stencil does not list this direction: retry with the inverse direction
+            neighbor_vec = inverse_direction(neighbor_vec)
+            neighbor = offset_to_direction_string(neighbor_vec)
+            if FieldType.is_staggered_flux(self):
+                prefactor = -1
+        if neighbor not in self.staggered_stencil:
+            raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")
+        # remove the half step in the (possibly inverted) stencil direction to obtain the cell offset
+        offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))
+        # the first index selects the staggered position
+        idx = self.staggered_stencil.index(neighbor)
+        if self.index_dimensions == 1:  # this field stores a scalar value at each staggered position
+            if index is not None:
+                raise ValueError("Cannot specify an index for a scalar staggered field")
+            return prefactor * Field.Access(self, offset, (idx,))
+        else:  # this field stores a vector or tensor at each staggered position
+            if index is None:
+                raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
+            if type(index) is np.ndarray:
+                index = tuple(index)
+            if type(index) is not tuple:
+                index = (index,)
+            if self.index_dimensions != len(index) + 1:
+                raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
+            return prefactor * Field.Access(self, offset, (idx, *index))
+    def staggered_vector_access(self, offset):
+        """Like staggered_access, but returns the entire vector/tensor stored at offset.
+        The first index dimension selects the staggered position, so only the remaining index dimensions
+        contribute entries to the returned matrix.
+        """
+        assert FieldType.is_staggered(self)
+        rank = self.index_dimensions
+        if rank == 1:
+            entries = [self.staggered_access(offset)]
+        elif rank == 2:
+            entries = [self.staggered_access(offset, component) for component in range(self.index_shape[1])]
+        elif rank == 3:
+            entries = [[self.staggered_access(offset, (row, col)) for col in range(self.index_shape[2])]
+                       for row in range(self.index_shape[1])]
+        else:
+            raise NotImplementedError("staggered_vector_access is not implemented for more than 3 index dimensions")
+        return sp.Matrix(entries)
+    @property
+    def staggered_stencil(self):
+        """Direction strings of the staggered positions stored per cell.
+        The stencil is looked up by the number of spatial dimensions and the size of the first index dimension.
+        Raises:
+            ValueError: if no stencil with that number of staggered points is known
+        """
+        assert FieldType.is_staggered(self)
+        stencils_2d = {
+            2: ["W", "S"],  # D2Q5
+            4: ["W", "S", "SW", "NW"],  # D2Q9
+        }
+        stencils_3d = {
+            3: ["W", "S", "B"],  # D3Q7
+            7: ["W", "S", "B", "BSW", "TSW", "BNW", "TNW"],  # D3Q15
+            9: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS"],  # D3Q19
+            13: ["W", "S", "B", "SW", "NW", "BW", "TW", "BS", "TS", "BSW", "TSW", "BNW", "TNW"],  # D3Q27
+        }
+        stencils = {2: stencils_2d, 3: stencils_3d}
+        staggered_points = self.index_shape[0]
+        known = stencils[self.spatial_dimensions]
+        if staggered_points not in known:
+            raise ValueError(f"No known stencil has {self.index_shape[0]} staggered points")
+        return known[staggered_points]
+    @property
+    def staggered_stencil_name(self):
+        """Name of the stencil in ``D<dim>Q<n>`` form, with n being twice the number of staggered points plus one."""
+        assert FieldType.is_staggered(self)
+        dimensions = self.spatial_dimensions
+        directions = self.index_shape[0] * 2 + 1
+        return f"D{dimensions}Q{directions}"
    def __call__(self, *args, **kwargs):
        center = tuple([0] * self.spatial_dimensions)
        return Field.Access(self, center)(*args, **kwargs)
    def hashable_contents(self):
-        dth = hash(self._dtype)
+        return (self._layout,
-        return self._layout, self.shape, self.strides, dth, self.field_type, self._field_name, self.latex_name
+                self.shape,
+                self.strides,
+                self.field_type,
+                self._field_name,
+                self.latex_name,
+                self._dtype)
    def __hash__(self):
        return hash(self.hashable_contents())
@@ -409,8 +507,48 @@ class Field(AbstractField):
            return False
        return self.hashable_contents() == other.hashable_contents()
+    @property
+    def physical_coordinates(self):
+        """Coordinate origin plus ``pystencils.x_vector``, mapped through ``coordinate_transform``.
+        The transform is called if it is callable and applied by matrix multiplication otherwise.
+        """
+        transform = self.coordinate_transform
+        coordinates = self.coordinate_origin + pystencils.x_vector(self.spatial_dimensions)
+        if hasattr(transform, '__call__'):
+            return transform(coordinates)
+        return transform @ coordinates
+    @property
+    def physical_coordinates_staggered(self):
+        """Coordinate origin plus ``pystencils.x_staggered_vector``, mapped through ``coordinate_transform``.
+        As in ``physical_coordinates``, the transform is called if it is callable and applied by matrix
+        multiplication otherwise; previously a callable transform failed here on the ``@`` operator.
+        """
+        coordinates = self.coordinate_origin + pystencils.x_staggered_vector(self.spatial_dimensions)
+        if hasattr(self.coordinate_transform, '__call__'):
+            return self.coordinate_transform(coordinates)
+        else:
+            return self.coordinate_transform @ coordinates
+    def index_to_physical(self, index_coordinates: sp.Matrix, staggered=False):
+        """Map index coordinates to physical coordinates.
+        Args:
+            index_coordinates: coordinates in index space, one entry per entry of ``coordinate_origin``
+            staggered: if True, 0.5 is added to every index coordinate before the mapping
+        Returns:
+            ``coordinate_transform`` applied to origin plus index coordinates (called if callable, otherwise
+            multiplied from the left)
+        """
+        if staggered:
+            half_cell = sp.Matrix([0.5] * len(self.coordinate_origin))
+            index_coordinates = half_cell + index_coordinates
+        transform = self.coordinate_transform
+        if hasattr(transform, '__call__'):
+            return transform(self.coordinate_origin + index_coordinates)
+        return transform @ (self.coordinate_origin + index_coordinates)
+    def physical_to_index(self, physical_coordinates: sp.Matrix, staggered=False):
+        """Map physical coordinates back to index coordinates (inverse of ``index_to_physical``).
+        Args:
+            physical_coordinates: coordinates in physical space
+            staggered: if True, the result refers to the grid shifted by half a cell
+        Returns:
+            the index coordinates; if the transform is callable and offers no ``inv``, the result of
+            ``sp.solve`` for the symbols ``index_coordinates0..`` is returned as is
+        """
+        if hasattr(self.coordinate_transform, '__call__'):
+            if hasattr(self.coordinate_transform, 'inv'):
+                # falls through to the staggered shift below instead of ignoring ``staggered``
+                rtn = self.coordinate_transform.inv()(physical_coordinates) - self.coordinate_origin
+            else:
+                # one unknown per spatial dimension: coordinate_origin has no entries for index dimensions,
+                # so sizing the unknowns by ndim would not match its shape
+                idx = sp.Matrix(sp.symbols(f'index_coordinates:{self.spatial_dimensions}', real=True))
+                # the half-cell shift is part of the equations, so the solution needs no further correction
+                rtn = sp.solve(self.index_to_physical(idx, staggered) - physical_coordinates, idx)
+                assert rtn, f'Could not find inverse of coordinate_transform: {self.index_to_physical(idx)}'
+                return rtn
+        else:
+            rtn = self.coordinate_transform.inv() @ physical_coordinates - self.coordinate_origin
+        if staggered:
+            rtn = sp.Matrix([i - 0.5 for i in rtn])
+        return rtn
+    def set_coordinate_origin_to_field_center(self):
+        """Set ``coordinate_origin`` to minus half of the spatial shape in every dimension."""
+        half_extents = [extent / 2 for extent in self.spatial_shape]
+        self.coordinate_origin = -sp.Matrix(half_extents)
    # noinspection PyAttributeOutsideInit,PyUnresolvedReferences
-    class Access(TypedSymbol, AbstractField.AbstractAccess):
+    class Access(TypedSymbol):
        """Class representing a relative access into a `Field`.
        This class behaves like a normal sympy Symbol, it is actually derived from it. One can built up
@@ -429,11 +567,13 @@ class Field(AbstractField):
            >>> central_y_component.at_index(0)  # change component
            v_C^0
        """
+        _iterable = False  # see https://i10git.cs.fau.de/pycodegen/pystencils/-/merge_requests/166#note_10680
        def __new__(cls, name, *args, **kwargs):
            obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs)
            return obj
-        def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False):
+        def __new_stage2__(self, field, offsets=(0, 0, 0), idx=None, is_absolute_access=False, dtype=None):
            field_name = field.name
            offsets_and_index = (*offsets, *idx) if idx is not None else offsets
            constant_offsets = not any([isinstance(o, sp.Basic) and not o.is_Integer for o in offsets_and_index])
@@ -458,11 +598,15 @@ class Field(AbstractField):
                offset_name = hashlib.md5(pickle.dumps(offsets_and_index)).hexdigest()[:12]
                superscript = None
-            symbol_name = "%s_%s" % (field_name, offset_name)
+            symbol_name = f"{field_name}_{offset_name}"
            if superscript is not None:
                symbol_name += "^" + superscript
-            obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+            if dtype:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
+            else:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
            obj._field = field
            obj._offsets = []
            for o in offsets:
@@ -470,7 +614,7 @@ class Field(AbstractField):
                    obj._offsets.append(o)
                else:
                    obj._offsets.append(int(o))
-            obj._offsets = tuple(obj._offsets)
+            obj._offsets = tuple(sp.sympify(obj._offsets))
            obj._offsetName = offset_name
            obj._superscript = superscript
            obj._index = idx
@@ -484,7 +628,10 @@ class Field(AbstractField):
            return obj
        def __getnewargs__(self):
+            # Positional arguments in the order expected by ``__new_stage2__``:
+            # (field, offsets, idx, is_absolute_access, dtype)
-            return self.field, self.offsets, self.index, self.is_absolute_access
+            return self.field, self.offsets, self.index, self.is_absolute_access, self.dtype
+        def __getnewargs_ex__(self):
+            # Same positional arguments as ``__getnewargs__`` plus an empty keyword-argument dict
+            return (self.field, self.offsets, self.index, self.is_absolute_access, self.dtype), {}
        # noinspection SpellCheckingInspection
        __xnew__ = staticmethod(__new_stage2__)
@@ -501,18 +648,18 @@ class Field(AbstractField):
                idx = ()
            if len(idx) != self.field.index_dimensions:
-                raise ValueError("Wrong number of indices: "
+                raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
-                                 "Got %d, expected %d" % (len(idx), self.field.index_dimensions))
+            if len(idx) == 1 and isinstance(idx[0], str):
-            return Field.Access(self.field, self._offsets, idx)
+                dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=dtype)
+            else:
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=self.dtype)
        def __getitem__(self, *idx):
            return self.__call__(*idx)
-        def __iter__(self):
-            """This is necessary to work with parts of sympy that test if an object is iterable (e.g. simplify).
-            The __getitem__ would make it iterable"""
-            raise TypeError("Field access is not iterable")
        @property
        def field(self) -> 'Field':
            """Field that the Access points to"""
@@ -562,7 +709,8 @@ class Field(AbstractField):
            """
            offset_list = list(self.offsets)
            offset_list[coord_id] += offset
-            return Field.Access(self.field, tuple(offset_list), self.index)
+            return Field.Access(self.field, tuple(offset_list), self.index,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)
        def get_shifted(self, *shift) -> 'Field.Access':
            """Returns a new Access with changed spatial coordinates
@@ -572,7 +720,11 @@ class Field(AbstractField):
                >>> f[0,0].get_shifted(1, 1)
                f_NE
            """
-            return Field.Access(self.field, tuple(a + b for a, b in zip(shift, self.offsets)), self.index)
+            return Field.Access(self.field,
+                                tuple(a + b for a, b in zip(shift, self.offsets)),
+                                self.index,
+                                is_absolute_access=self.is_absolute_access,
+                                dtype=self.dtype)
        def at_index(self, *idx_tuple) -> 'Field.Access':
            """Returns new Access with changed index.
@@ -582,7 +734,15 @@ class Field(AbstractField):
                >>> f(0).at_index(8)
                f_C^8
            """
-            return Field.Access(self.field, self.offsets, idx_tuple)
+            return Field.Access(self.field, self.offsets, idx_tuple,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)
+        def _eval_subs(self, old, new):
+            """Return a new Access whose offsets and index entries have ``old`` replaced by ``new``."""
+            def substituted(entries):
+                # entries are sympified first so that plain Python numbers support ``subs``
+                return tuple(sp.sympify(entry).subs(old, new) for entry in entries)
+            return Field.Access(self.field,
+                                substituted(self.offsets),
+                                substituted(self.index),
+                                is_absolute_access=self.is_absolute_access,
+                                dtype=self.dtype)
        @property
        def is_absolute_access(self) -> bool:
@@ -599,30 +759,125 @@ class Field(AbstractField):
        def _hashable_content(self):
+            # Extends the superclass content with the field's hashable contents, the index and
+            # offset entries and the absolute-access flag.
            super_class_contents = super(Field.Access, self)._hashable_content()
-            return (super_class_contents, self._field.hashable_contents(), *self._index, *self._offsets)
+            return (super_class_contents, self._field.hashable_contents(), *self._index,
+                    *self._offsets, self._is_absolute_access)
+        def _staggered_offset(self, offsets, index):
+            """Return ``offsets`` shifted by half of the stencil direction selected by ``index``.
+            The direction string is looked up in the field's ``staggered_stencil`` and converted with
+            ``direction_string_to_offset``; each component is added as an exact ``sp.Rational`` half.
+            """
+            assert FieldType.is_staggered(self._field)
+            direction = self._field.staggered_stencil[index]
+            shift = direction_string_to_offset(direction, self._field.spatial_dimensions)
+            shifted = []
+            for axis, offset in enumerate(offsets):
+                shifted.append(offset + sp.Rational(int(shift[axis]), 2))
+            return shifted
        def _latex(self, _):
+            # LaTeX representation: field name with the offsets as subscript and, if the field has
+            # index dimensions, the index as superscript.
            n = self._field.latex_name if self._field.latex_name else self._field.name
            offset_str = ",".join([sp.latex(o) for o in self.offsets])
+            if FieldType.is_staggered(self._field):
+                # for staggered fields the first index entry selects the stencil direction,
+                # which is folded into the printed offsets
+                offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
+                                       for i in range(len(self.offsets))])
            if self.is_absolute_access:
-                offset_str = "\\mathbf{}".format(offset_str)
+                # the doubled braces emit literal LaTeX braces so that the offsets become the
+                # argument of \mathbf
+                offset_str = f"\\mathbf{{{offset_str}}}"
            elif self.field.spatial_dimensions > 1:
-                offset_str = "({})".format(offset_str)
+                offset_str = f"({offset_str})"
-            if self.index and self.index != (0,):
+            if FieldType.is_staggered(self._field):
-                return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
+                # the first index entry was already consumed by the offsets above
+                if self.index and self.field.index_dimensions > 1:
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
+                else:
+                    return f"{{{n}}}_{{{offset_str}}}"
            else:
-                return "{{%s}_{%s}}" % (n, offset_str)
+                if self.index and self.field.index_dimensions > 0:
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
+                else:
+                    return f"{{{n}}}_{{{offset_str}}}"
        def __str__(self):
+            # Plain-text representation: ``name[offsets]`` followed by ``(index)`` when the field
+            # has index dimensions.
            n = self._field.latex_name if self._field.latex_name else self._field.name
            offset_str = ",".join([sp.latex(o) for o in self.offsets])
+            if FieldType.is_staggered(self._field):
+                # for staggered fields the first index entry selects the stencil direction,
+                # which is folded into the printed offsets
+                offset_str = ",".join([sp.latex(self._staggered_offset(self.offsets, self.index[0])[i])
+                                       for i in range(len(self.offsets))])
            if self.is_absolute_access:
-                offset_str = "[abs]{}".format(offset_str)
+                offset_str = f"[abs]{offset_str}"
-            if self.index and self.index != (0,):
-                return "%s[%s](%s)" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
+            if FieldType.is_staggered(self._field):
+                # the first index entry was already consumed by the offsets above
+                if self.index and self.field.index_dimensions > 1:
+                    return f"{n}[{offset_str}]({self.index[1:] if len(self.index) > 2 else self.index[1]})"
+                else:
+                    return f"{n}[{offset_str}]"
+            else:
+                if self.index and self.field.index_dimensions > 0:
+                    return f"{n}[{offset_str}]({self.index if len(self.index) > 1 else self.index[0]})"
+                else:
+                    return f"{n}[{offset_str}]"
+def fields(description=None, index_dimensions=0, layout=None,
+           field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
+    """Creates pystencils fields from a string description.
+    Args:
+        description: string parsed by ``_parse_description``; if omitted, one field is created per
+                     keyword argument
+        index_dimensions: forwarded to ``Field.create_from_numpy_array``; only used when no
+                          description is given
+        layout: layout passed to the field factories; defaults to ``'numpy'`` when a description is
+                given and must be ``None`` otherwise
+        field_type: forwarded to the field factories
+        kwargs: maps field names to numpy arrays used as templates
+    Returns:
+        A single `Field` if exactly one field was created, otherwise a list of fields.
+    Raises:
+        ValueError: if no field was created
+    Examples:
+        Create a 2D scalar and vector field:
+            >>> s, v = fields("s, v(2): double[2D]")
+            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
+            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
+        Create an integer field of shape (10, 20):
+            >>> f = fields("f : int32[10, 20]")
+            >>> f.has_fixed_shape, f.shape
+            (True, (10, 20))
+        Numpy arrays can be used as template for shape and data type of field:
+            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
+            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
+            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
+            >>> assert v.index_shape == (2,)
+        Format string can be left out, field names are taken from keyword arguments.
+            >>> fields(f1=arr_s, f2=arr_s)
+            [f1: double[20,20], f2: double[20,20]]
+        The keyword names ``index_dimensions`` and ``layout`` have special meaning, don't use them for field names
+            >>> f = fields(f=arr_v, index_dimensions=1)
+            >>> assert f.index_dimensions == 1
+            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
+            >>> f.layout
+            (2, 1, 0)
+    """
+    result = []
+    if description:
+        field_descriptions, dtype, shape = _parse_description(description)
+        layout = 'numpy' if layout is None else layout
+        for field_name, idx_shape in field_descriptions:
+            if field_name in kwargs:
+                # a numpy array was passed as template: its trailing dimensions must match the
+                # index shape given in the description
+                arr = kwargs[field_name]
+                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
+                assert idx_shape_of_arr == idx_shape
+                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
+                                                  field_type=field_type)
+            elif isinstance(shape, tuple):
+                # tuple shape: fixed-size field
+                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
+                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
+            elif isinstance(shape, int):
+                # integer shape: used as the number of spatial dimensions of a generic field
+                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            elif shape is None:
+                # no shape information: generic field with two spatial dimensions
+                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
            else:
-                return "%s[%s]" % (n, offset_str)
+                assert False
+            result.append(f)
+    else:
+        assert layout is None, "Layout can not be specified when creating Field from numpy array"
+        for field_name, arr in kwargs.items():
+            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
+                                                        field_type=field_type))
+    if len(result) == 0:
+        raise ValueError("Could not parse field description")
+    elif len(result) == 1:
+        return result[0]
+    else:
+        return result
 def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
@@ -685,8 +940,6 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
    if not alignment:
        res = np.empty(shape, order='c', **kwargs)
    else:
-        if alignment is True:
-            alignment = 8 * 4
        res = aligned_empty(shape, alignment, byte_offset=byte_offset, **kwargs)
    for a, b in reversed(swaps):
@@ -695,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
 def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
+    """Translate a layout descriptor string into a tuple over ``range(dim)``.
+    The descriptor is compared case-insensitively. ``fzyx``, ``zyxf``, ``soa``, ``aos``, ``f`` and
+    ``reverse_numpy`` yield the reversed order, ``c`` and ``numpy`` the ascending order.
+    Raises:
+        ValueError: if ``dim`` is not positive, if ``dim`` exceeds 3 for one of
+                    ``fzyx``/``zyxf``/``soa``/``aos``, or if the descriptor is unknown
+    """
-    if layout_str in ('fzyx', 'zyxf'):
+    if dim <= 0:
-        assert dim <= 3
+        raise ValueError("Dimensionality must be positive")
-        return tuple(reversed(range(dim)))
+    layout_str = layout_str.lower()
-    if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
+    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
+        if dim > 3:
+            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
+        return tuple(reversed(range(dim)))
+    if layout_str in ('f', 'reverse_numpy'):
        return tuple(reversed(range(dim)))
-    elif layout_str in ('c', 'numpy', 'AoS'):
+    elif layout_str in ('c', 'numpy'):
        return tuple(range(dim))
    raise ValueError("Unknown layout descriptor " + layout_str)
 def layout_string_to_tuple(layout_str, dim):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
    layout_str = layout_str.lower()
    if layout_str == 'fzyx' or layout_str == 'soa':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim)))
    elif layout_str == 'zyxf' or layout_str == 'aos':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
    elif layout_str == 'f' or layout_str == 'reverse_numpy':
        return tuple(reversed(range(dim)))
@@ -775,16 +1039,17 @@ type_description_regex = re.compile(r"""
 """, re.VERBOSE | re.IGNORECASE)
-def _parse_description(description):
+def _parse_part1(d):
-    def parse_part1(d):
+    """Yield ``(name, index)`` pairs for consecutive matches of ``field_description_regex`` in ``d``.
+    ``index`` is a tuple of ints parsed from the comma separated second regex group, or ``()`` if
+    that group is empty.
+    """
+    result = field_description_regex.match(d)
+    while result:
+        name, index_str = result.group(1), result.group(2)
+        index = tuple(int(e) for e in index_str.split(",")) if index_str else ()
+        yield name, index
+        # drop the consumed prefix and try to match the next description
+        d = d[result.end():]
        result = field_description_regex.match(d)
-        while result:
-            name, index_str = result.group(1), result.group(2)
-            index = tuple(int(e) for e in index_str.split(",")) if index_str else ()
-            yield name, index
-            d = d[result.end():]
-            result = field_description_regex.match(d)
+def _parse_description(description):
    def parse_part2(d):
        result = type_description_regex.match(d)
        if result:
@@ -808,7 +1073,7 @@ def _parse_description(description):
    else:
        field_description, field_info = description, 'float64[2D]'
-    fields_info = [e for e in parse_part1(field_description)]
+    fields_info = [e for e in _parse_part1(field_description)]
    if not field_info:
        raise ValueError("Could not parse field description")

--- a/src/pystencils/functions.py
+++ b/src/pystencils/functions.py
+import sympy as sp
+from pystencils.typing import PointerType
+class DivFunc(sp.Function):
+    """
+    DivFunc represents a division operation, since sympy represents divisions with ^-1
+    It takes exactly two arguments; numeric evaluation divides the first argument by the second.
+    Note that the property named ``divisor`` returns the first argument and ``dividend`` the second.
+    """
+    is_Atom = True
+    is_real = True
+    def __new__(cls, *args, **kwargs):
+        if len(args) != 2:
+            raise ValueError(f'{cls} takes only 2 arguments, instead {len(args)} received!')
+        first, second = args
+        return sp.Function.__new__(cls, first, second, **kwargs)
+    @property
+    def divisor(self):
+        """First argument of the function."""
+        return self.args[0]
+    @property
+    def dividend(self):
+        """Second argument of the function."""
+        return self.args[1]
+    def _eval_evalf(self, *args, **kwargs):
+        # the arguments passed by sympy are not forwarded to the operands
+        top = self.divisor.evalf()
+        bottom = self.dividend.evalf()
+        return top / bottom
+class AddressOf(sp.Function):
+    """
+    AddressOf is the '&' operation in C. It gets the address of a lvalue.
+    """
+    is_Atom = True
+    def __new__(cls, arg):
+        return sp.Function.__new__(cls, arg)
+    @property
+    def canonical(self):
+        """``canonical`` of the wrapped argument; raises NotImplementedError if it has none."""
+        target = self.args[0]
+        if not hasattr(target, 'canonical'):
+            raise NotImplementedError()
+        return target.canonical
+    @property
+    def is_commutative(self):
+        """Commutativity is inherited from the wrapped argument."""
+        return self.args[0].is_commutative
+    @property
+    def dtype(self):
+        """Restrict-qualified pointer type to the wrapped argument's ``dtype``.
+        Raises:
+            ValueError: if the wrapped argument has no ``dtype`` attribute
+        """
+        target = self.args[0]
+        if not hasattr(target, 'dtype'):
+            raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
+        return PointerType(target.dtype, restrict=True)
--- a/pystencils/gpucuda/__init__.py
+++ b/pystencils/gpucuda/__init__.py
-from pystencils.gpucuda.cudajit import make_python_function
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
+from pystencils.gpu.gpujit import make_python_function
+from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
-__all__ = ['create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
+__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
+           'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
           'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
--- a/src/pystencils/gpu/gpu_array_handler.py
+++ b/src/pystencils/gpu/gpu_array_handler.py
+try:
+    import cupy as cp
+    import cupyx as cpx
+except ImportError:
+    cp = None
+    cpx = None
+import numpy as np
+class GPUArrayHandler:
+    """Creates and transfers cupy arrays on the device given by ``device_number``."""
+    def __init__(self, device_number):
+        self._device_number = device_number
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        """Return ``cp.zeros`` allocated while the handler's device is active."""
+        with cp.cuda.Device(self._device_number):
+            return cp.zeros(shape=shape, dtype=dtype, order=order)
+    def ones(self, shape, dtype=np.float64, order='C'):
+        """Return ``cp.ones`` allocated while the handler's device is active."""
+        with cp.cuda.Device(self._device_number):
+            return cp.ones(shape=shape, dtype=dtype, order=order)
+    def empty(self, shape, dtype=np.float64, order='C'):
+        """Return ``cp.empty`` allocated while the handler's device is active."""
+        with cp.cuda.Device(self._device_number):
+            return cp.empty(shape=shape, dtype=dtype, order=order)
+    def to_gpu(self, numpy_array):
+        """Copy ``numpy_array`` to the handler's device.
+        If ``numpy_array`` is a view of another ndarray, the base array is copied and the axis
+        swaps reported by ``_get_index_swaps`` are applied to the resulting cupy array.
+        """
+        swaps = _get_index_swaps(numpy_array)
+        # both branches run inside the device context so the copy always lands on the
+        # handler's device, not on whichever device happens to be current
+        with cp.cuda.Device(self._device_number):
+            if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+                gpu_array = cp.asarray(numpy_array.base)
+                for a, b in reversed(swaps):
+                    gpu_array = gpu_array.swapaxes(a, b)
+                return gpu_array
+            return cp.asarray(numpy_array)
+    def upload(self, array, numpy_array):
+        """Copy the contents of ``numpy_array`` into the existing device array ``array``."""
+        assert self._device_number == array.device.id
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            # views are transferred through their base arrays
+            with cp.cuda.Device(self._device_number):
+                array.base.set(numpy_array.base)
+        else:
+            with cp.cuda.Device(self._device_number):
+                array.set(numpy_array)
+    def download(self, array, numpy_array):
+        """Copy the contents of the device array ``array`` into ``numpy_array``."""
+        assert self._device_number == array.device.id
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            # views are transferred through their base arrays
+            with cp.cuda.Device(self._device_number):
+                numpy_array.base[:] = array.base.get()
+        else:
+            with cp.cuda.Device(self._device_number):
+                numpy_array[:] = array.get()
+    def randn(self, shape, dtype=np.float64):
+        """Return ``cp.random.randn`` of the given shape, created while the handler's device is active."""
+        with cp.cuda.Device(self._device_number):
+            return cp.random.randn(*shape, dtype=dtype)
+    @staticmethod
+    def pinned_numpy_array(layout, shape, dtype):
+        """Allocate an array with ``cpx.empty_pinned`` whose axes are arranged according to ``layout``.
+        ``layout`` must be a permutation of ``range(len(shape))``.
+        """
+        assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
+        # determine the pairwise swaps that turn the identity ordering into ``layout``
+        cur_layout = list(range(len(shape)))
+        swaps = []
+        for i in range(len(layout)):
+            if cur_layout[i] != layout[i]:
+                index_to_swap_with = cur_layout.index(layout[i])
+                swaps.append((i, index_to_swap_with))
+                cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
+        assert tuple(cur_layout) == tuple(layout)
+        # allocate in C order with the swapped shape, then swap the axes back
+        shape = list(shape)
+        for a, b in swaps:
+            shape[a], shape[b] = shape[b], shape[a]
+        res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
+        for a, b in reversed(swaps):
+            res = res.swapaxes(a, b)
+        return res
+    from_numpy = to_gpu
+class GPUNotAvailableHandler:
+    """Placeholder handler: every attribute access on an instance raises ``NotImplementedError``."""
+    def __getattribute__(self, name):
+        raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
+def _get_index_swaps(array):
+    """Return the axis pairs whose strides sit at different positions in ``array`` and its base.
+    An empty list is returned if ``array`` is not a view of another ``np.ndarray``.
+    """
+    base = array.base
+    if base is None or not isinstance(base, np.ndarray):
+        return []
+    swaps = []
+    for stride in base.strides:
+        pos_in_base = base.strides.index(stride)
+        pos_in_view = array.strides.index(stride)
+        # skip axes that did not move and pairs already recorded in the opposite direction
+        if pos_in_base == pos_in_view or (pos_in_view, pos_in_base) in swaps:
+            continue
+        swaps.append((pos_in_base, pos_in_view))
+    return swaps
No results found