5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/alignedarray.py
+++ b/pystencils/alignedarray.py
 import numpy as np
-from pystencils.data_types import BasicType
 def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
@@ -21,20 +20,26 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                               get_vector_instruction_set)
-        type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
        instruction_sets = get_supported_instruction_sets()
        if instruction_sets is None:
            byte_alignment = 64
        elif byte_alignment == 'cacheline':
            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
-            if all([s is None for s in cacheline_sizes]):
+            if all([s is None for s in cacheline_sizes]) or \
-                byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
-                                      for is_name in instruction_sets])
+                widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
+                          for is_name in instruction_sets
+                          if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
+                byte_alignment = 64 if all([s is None for s in widths]) else max(widths)
            else:
                byte_alignment = max([s for s in cacheline_sizes if s is not None])
+        elif not any([type(get_vector_instruction_set(dtype, is_name)['width']) is int
+                      for is_name in instruction_sets]):
+            byte_alignment = 64
        else:
-            byte_alignment = max([get_vector_instruction_set(type_name, is_name)['width'] * np.dtype(dtype).itemsize
+            byte_alignment = max([get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
-                                  for is_name in instruction_sets])
+                                  for is_name in instruction_sets
+                                  if type(get_vector_instruction_set(dtype, is_name)['width']) is int])
    if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
        size = np.prod(shape)
        d = np.dtype(dtype)
@@ -72,7 +77,7 @@ def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, o
        return tmp
-def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True):
+def aligned_zeros(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.zeros((), arr.dtype)
@@ -80,7 +85,7 @@ def aligned_zeros(shape, byte_alignment=True, dtype=float, byte_offset=0, order=
    return arr
-def aligned_ones(shape, byte_alignment=True, dtype=float, byte_offset=0, order='C', align_inner_coordinate=True):
+def aligned_ones(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.ones((), arr.dtype)

--- a/pystencils/assignment.py
+++ b/pystencils/assignment.py
 import numpy as np
 import sympy as sp
-from sympy.codegen.ast import Assignment
+from sympy.codegen.ast import Assignment, AugmentedAssignment, AddAugmentedAssignment
 from sympy.printing.latex import LatexPrinter
-__all__ = ['Assignment', 'assignment_from_stencil']
+__all__ = ['Assignment', 'AugmentedAssignment', 'AddAugmentedAssignment', 'assignment_from_stencil']
 def print_assignment_latex(printer, expr):
+    binop = f"{expr.binop}=" if isinstance(expr, AugmentedAssignment) else ''
    """sympy cannot print Assignments as Latex. Thus, this function is added to the sympy Latex printer"""
    printed_lhs = printer.doprint(expr.lhs)
    printed_rhs = printer.doprint(expr.rhs)
-    return r"{printed_lhs} \leftarrow {printed_rhs}".format(printed_lhs=printed_lhs, printed_rhs=printed_rhs)
+    return fr"{printed_lhs} \leftarrow_{{{binop}}} {printed_rhs}"
 def assignment_str(assignment):
-    return r"{lhs} ← {rhs}".format(lhs=assignment.lhs, rhs=assignment.rhs)
+    op = f"{assignment.binop}=" if isinstance(assignment, AugmentedAssignment) else '←'
+    return fr"{assignment.lhs} {op} {assignment.rhs}"
 _old_new = sp.codegen.ast.Assignment.__new__
+# TODO Typing Part2 add default type, defult_float_type, default_int_type and use sane defaults
 def _Assignment__new__(cls, lhs, rhs, *args, **kwargs):
    if isinstance(lhs, (list, tuple, sp.Matrix)) and isinstance(rhs, (list, tuple, sp.Matrix)):
        assert len(lhs) == len(rhs), f'{lhs} and {rhs} must have same length when performing vector assignment!'
@@ -31,20 +34,10 @@ Assignment.__str__ = assignment_str
 Assignment.__new__ = _Assignment__new__
 LatexPrinter._print_Assignment = print_assignment_latex
-sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))
+AugmentedAssignment.__str__ = assignment_str
+LatexPrinter._print_AugmentedAssignment = print_assignment_latex
-# Apparently, in SymPy 1.4 Assignment.__hash__ is not implemented. This has been fixed in current master
-try:
-    sympy_version = sp.__version__.split('.')
-    if int(sympy_version[0]) <= 1 and int(sympy_version[1]) <= 4:
+sp.MutableDenseMatrix.__hash__ = lambda self: hash(tuple(self))
-        def hash_fun(self):
-            return hash((self.lhs, self.rhs))
-        Assignment.__hash__ = hash_fun
-except Exception:
-    pass
 def assignment_from_stencil(stencil_array, input_field, output_field,

--- a/pystencils/astnodes.py
+++ b/pystencils/astnodes.py
@@ -5,11 +5,12 @@ from typing import Any, List, Optional, Sequence, Set, Union
 import sympy as sp
-import pystencils
+from pystencils.assignment import Assignment
-from pystencils.data_types import TypedImaginaryUnit, TypedSymbol, cast_func, create_type
+from pystencils.enums import Target, Backend
 from pystencils.field import Field
-from pystencils.kernelparameters import FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol
 from pystencils.sympyextensions import fast_subs
+from pystencils.typing import (create_type, get_next_parent_of_type,
+                               FieldPointerSymbol, FieldShapeSymbol, FieldStrideSymbol, TypedSymbol, CFunction)
 NodeOrExpr = Union['Node', sp.Expr]
@@ -136,7 +137,6 @@ class Conditional(Node):
 class KernelFunction(Node):
    class Parameter:
        """Function parameter.
@@ -176,7 +176,9 @@ class KernelFunction(Node):
        def field_name(self):
            return self.fields[0].name
-    def __init__(self, body, target, backend, compile_function, ghost_layers, function_name="kernel", assignments=None):
+    def __init__(self, body, target: Target, backend: Backend, compile_function, ghost_layers,
+                 function_name: str = "kernel",
+                 assignments=None):
        super(KernelFunction, self).__init__()
        self._body = body
        body.parent = self
@@ -191,15 +193,19 @@ class KernelFunction(Node):
        # function that compiles the node to a Python callable, is set by the backends
        self._compile_function = compile_function
        self.assignments = assignments
+        # If nontemporal stores are activated together with the Neon instruction set it results in cacheline zeroing
+        # For cacheline zeroing the information of the field size for each field is needed. Thus, in this case
+        # all field sizes are kernel parameters and not just the common field size used for the loops
+        self.use_all_written_field_sizes = False
    @property
    def target(self):
-        """Currently either 'cpu' or 'gpu' """
+        """See pystencils.Target"""
        return self._target
    @property
    def backend(self):
-        """Backend for generating the code e.g. 'llvm', 'c', 'cuda' """
+        """Backend for generating the code: `Backend`"""
        return self._backend
    @property
@@ -226,13 +232,13 @@ class KernelFunction(Node):
    @property
    def fields_accessed(self) -> Set[Field]:
        """Set of Field instances: fields which are accessed inside this kernel function"""
-        from pystencils.interpolation_astnodes import InterpolatorAccess
+        return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess)))
-        return set(o.field for o in itertools.chain(self.atoms(ResolvedFieldAccess), self.atoms(InterpolatorAccess)))
    @property
    def fields_written(self) -> Set[Field]:
        assignments = self.atoms(SympyAssignment)
-        return {a.lhs.field for a in assignments if isinstance(a.lhs, ResolvedFieldAccess)}
+        return set().union(itertools.chain.from_iterable([f.field for f in a.lhs.free_symbols if hasattr(f, 'field')]
+                                                         for a in assignments))
    @property
    def fields_read(self) -> Set[Field]:
@@ -246,6 +252,11 @@ class KernelFunction(Node):
        This function is expensive, cache the result where possible!
        """
        field_map = {f.name: f for f in self.fields_accessed}
+        sizes = set()
+        if self.use_all_written_field_sizes:
+            sizes = set().union(*(a.shape[:a.spatial_dimensions] for a in self.fields_written))
+            sizes = filter(lambda s: isinstance(s, FieldShapeSymbol), sizes)
        def get_fields(symbol):
            if hasattr(symbol, 'field_name'):
@@ -255,9 +266,13 @@ class KernelFunction(Node):
            return ()
        argument_symbols = self._body.undefined_symbols - self.global_variables
+        argument_symbols.update(sizes)
        parameters = [self.Parameter(symbol, get_fields(symbol)) for symbol in argument_symbols]
        if hasattr(self, 'indexing'):
            parameters += [self.Parameter(s, []) for s in self.indexing.symbolic_parameters()]
+        # Exclude paramters of type CFunction. These parameters will result in a C function call that will be handled
+        # by including a respective header file in the compute kernel. Hence, it is not a free parameter.
+        parameters = [p for p in parameters if not isinstance(p.symbol, CFunction)]
        parameters.sort(key=lambda p: p.symbol.name)
        return parameters
@@ -291,8 +306,10 @@ class SkipIteration(Node):
 class Block(Node):
-    def __init__(self, nodes: List[Node]):
+    def __init__(self, nodes: Union[Node, List[Node]]):
        super(Block, self).__init__()
+        if not isinstance(nodes, list):
+            nodes = [nodes]
        self._nodes = nodes
        self.parent = None
        for n in self._nodes:
@@ -331,14 +348,6 @@ class Block(Node):
        assert self._nodes.count(insert_before) == 1
        idx = self._nodes.index(insert_before)
-        # move all assignment (definitions to the top)
-        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
-            while idx > 0:
-                pn = self._nodes[idx - 1]
-                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
-                    idx -= 1
-                else:
-                    break
        if not if_not_exists or self._nodes[idx] != new_node:
            self._nodes.insert(idx, new_node)
@@ -347,14 +356,6 @@ class Block(Node):
        assert self._nodes.count(insert_after) == 1
        idx = self._nodes.index(insert_after) + 1
-        # move all assignment (definitions to the top)
-        if isinstance(new_node, SympyAssignment) and new_node.is_declaration:
-            while idx > 0:
-                pn = self._nodes[idx - 1]
-                if isinstance(pn, LoopOverCoordinate) or isinstance(pn, Conditional):
-                    idx -= 1
-                else:
-                    break
        if not if_not_exists or not (self._nodes[idx - 1] == new_node
                                     or (idx < len(self._nodes) and self._nodes[idx] == new_node)):
            self._nodes.insert(idx, new_node)
@@ -389,7 +390,7 @@ class Block(Node):
    def symbols_defined(self):
        result = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
            else:
                result.update(a.symbols_defined)
@@ -400,7 +401,7 @@ class Block(Node):
        result = set()
        defined_symbols = set()
        for a in self.args:
-            if isinstance(a, pystencils.Assignment):
+            if isinstance(a, Assignment):
                result.update(a.free_symbols)
                defined_symbols.update({a.lhs})
            else:
@@ -430,7 +431,7 @@ class LoopOverCoordinate(Node):
    LOOP_COUNTER_NAME_PREFIX = "ctr"
    BLOCK_LOOP_COUNTER_NAME_PREFIX = "_blockctr"
-    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False):
+    def __init__(self, body, coordinate_to_loop_over, start, stop, step=1, is_block_loop=False, custom_loop_ctr=None):
        super(LoopOverCoordinate, self).__init__(parent=None)
        self.body = body
        body.parent = self
@@ -441,11 +442,12 @@ class LoopOverCoordinate(Node):
        self.body.parent = self
        self.prefix_lines = []
        self.is_block_loop = is_block_loop
+        self.custom_loop_ctr = custom_loop_ctr
    def new_loop_with_different_body(self, new_body):
        result = LoopOverCoordinate(new_body, self.coordinate_to_loop_over, self.start, self.stop,
-                                    self.step, self.is_block_loop)
+                                    self.step, self.is_block_loop, self.custom_loop_ctr)
-        result.prefix_lines = [l for l in self.prefix_lines]
+        result.prefix_lines = [prefix_line for prefix_line in self.prefix_lines]
        return result
    def subs(self, subs_dict):
@@ -507,10 +509,13 @@ class LoopOverCoordinate(Node):
    @property
    def loop_counter_name(self):
-        if self.is_block_loop:
+        if self.custom_loop_ctr:
-            return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
+            return self.custom_loop_ctr.name
        else:
-            return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
+            if self.is_block_loop:
+                return LoopOverCoordinate.get_block_loop_counter_name(self.coordinate_to_loop_over)
+            else:
+                return LoopOverCoordinate.get_loop_counter_name(self.coordinate_to_loop_over)
    @staticmethod
    def is_loop_counter_symbol(symbol):
@@ -534,14 +539,16 @@ class LoopOverCoordinate(Node):
    @property
    def loop_counter_symbol(self):
-        if self.is_block_loop:
+        if self.custom_loop_ctr:
-            return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
+            return self.custom_loop_ctr
        else:
-            return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
+            if self.is_block_loop:
+                return self.get_block_loop_counter_symbol(self.coordinate_to_loop_over)
+            else:
+                return self.get_loop_counter_symbol(self.coordinate_to_loop_over)
    @property
    def is_outermost_loop(self):
-        from pystencils.transformations import get_next_parent_of_type
        return get_next_parent_of_type(self, LoopOverCoordinate) is None
    @property
@@ -564,13 +571,14 @@ class SympyAssignment(Node):
    def __init__(self, lhs_symbol, rhs_expr, is_const=True, use_auto=False):
        super(SympyAssignment, self).__init__(parent=None)
        self._lhs_symbol = sp.sympify(lhs_symbol)
-        self.rhs = sp.sympify(rhs_expr)
+        self._rhs = sp.sympify(rhs_expr)
        self._is_const = is_const
        self._is_declaration = self.__is_declaration()
-        self.use_auto = use_auto
+        self._use_auto = use_auto
    def __is_declaration(self):
-        if isinstance(self._lhs_symbol, cast_func):
+        from pystencils.typing import CastFunc
+        if isinstance(self._lhs_symbol, CastFunc):
            return False
        if any(isinstance(self._lhs_symbol, c) for c in (Field.Access, sp.Indexed, TemporaryMemoryAllocation)):
            return False
@@ -580,15 +588,28 @@ class SympyAssignment(Node):
    def lhs(self):
        return self._lhs_symbol
+    @property
+    def rhs(self):
+        return self._rhs
    @lhs.setter
    def lhs(self, new_value):
        self._lhs_symbol = new_value
        self._is_declaration = self.__is_declaration()
+    @rhs.setter
+    def rhs(self, new_rhs_expr):
+        self._rhs = new_rhs_expr
    def subs(self, subs_dict):
        self.lhs = fast_subs(self.lhs, subs_dict)
        self.rhs = fast_subs(self.rhs, subs_dict)
+    def fast_subs(self, subs_dict, skip=None):
+        self.lhs = fast_subs(self.lhs, subs_dict, skip)
+        self.rhs = fast_subs(self.rhs, subs_dict, skip)
+        return self
    def optimize(self, optimizations):
        try:
            from sympy.codegen.rewriting import optimize
@@ -598,7 +619,7 @@ class SympyAssignment(Node):
    @property
    def args(self):
-        return [self._lhs_symbol, self.rhs, sp.sympify(self._is_const)]
+        return [self._lhs_symbol, self.rhs]
    @property
    def symbols_defined(self):
@@ -615,9 +636,10 @@ class SympyAssignment(Node):
            if isinstance(symbol, Field.Access):
                for i in range(len(symbol.offsets)):
                    loop_counters.add(LoopOverCoordinate.get_loop_counter_symbol(i))
-        result = {r for r in result if not isinstance(r, TypedImaginaryUnit)}
        result.update(loop_counters)
        result.update(self._lhs_symbol.atoms(sp.Symbol))
        return result
    @property
@@ -628,6 +650,10 @@ class SympyAssignment(Node):
    def is_const(self):
        return self._is_const
+    @property
+    def use_auto(self):
+        return self._use_auto
    def replace(self, child, replacement):
        if child == self.lhs:
            replacement.parent = self
@@ -650,7 +676,7 @@ class SympyAssignment(Node):
        return hash((self.lhs, self.rhs))
    def __eq__(self, other):
-        return type(self) == type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
+        return type(self) is type(other) and (self.lhs, self.rhs) == (other.lhs, other.rhs)
 class ResolvedFieldAccess(sp.Indexed):
@@ -692,6 +718,9 @@ class ResolvedFieldAccess(sp.Indexed):
    def __getnewargs__(self):
        return self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values
+    def __getnewargs_ex__(self):
+        return (self.base, self.indices[0], self.field, self.offsets, self.idx_coordinate_values), {}
 class TemporaryMemoryAllocation(Node):
    """Node for temporary memory buffer allocation.
@@ -837,46 +866,5 @@ class ConditionalFieldAccess(sp.Function):
    def __getnewargs__(self):
        return self.access, self.outofbounds_condition, self.outofbounds_value
+    def __getnewargs_ex__(self):
-class NontemporalFence(Node):
+        return (self.access, self.outofbounds_condition, self.outofbounds_value), {}
-    def __init__(self):
-        super(NontemporalFence, self).__init__(parent=None)
-    @property
-    def symbols_defined(self):
-        return set()
-    @property
-    def undefined_symbols(self):
-        return set()
-    @property
-    def args(self):
-        return []
-    def __eq__(self, other):
-        return isinstance(other, NontemporalFence)
-class CachelineSize(Node):
-    symbol = sp.Symbol("_clsize")
-    mask_symbol = sp.Symbol("_clsize_mask")
-    last_symbol = sp.Symbol("_cl_lastvec")
-    def __init__(self):
-        super(CachelineSize, self).__init__(parent=None)
-    @property
-    def symbols_defined(self):
-        return set([self.symbol, self.mask_symbol, self.last_symbol])
-    @property
-    def undefined_symbols(self):
-        return set()
-    @property
-    def args(self):
-        return []
-    def __eq__(self, other):
-        return isinstance(other, CachelineSize)
--- a/pystencils/backends/__init__.py
+++ b/pystencils/backends/__init__.py
@@ -6,9 +6,3 @@ try:
    __all__.append('print_dot')
 except ImportError:
    pass
-try:
-    from .llvm import generate_llvm  # NOQA
-    __all__.append('generate_llvm')
-except ImportError:
-    pass
--- a/src/pystencils/backends/arm_instruction_sets.py
+++ b/src/pystencils/backends/arm_instruction_sets.py
+from pystencils.typing import CFunction
+def get_argument_string(function_shortcut, first=''):
+    args = function_shortcut[function_shortcut.index('[') + 1: -1]
+    arg_string = "("
+    if first:
+        arg_string += first + ', '
+    for arg in args.split(","):
+        arg = arg.strip()
+        if not arg:
+            continue
+        if arg in ('0', '1', '2', '3', '4', '5'):
+            arg_string += "{" + arg + "},"
+        else:
+            arg_string += arg + ","
+    arg_string = arg_string[:-1] + ")"
+    return arg_string
+def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
+    if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
+        raise NotImplementedError(instruction_set)
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        cmp = 'cmp'
+    elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+        cmp = 'cmp'
+        bitwidth = int(instruction_set[4:])
+    elif instruction_set.startswith('sve'):
+        cmp = 'cmp'
+        bitwidth = int(instruction_set[3:])
+    elif instruction_set == 'neon':
+        cmp = 'c'
+        bitwidth = 128
+    base_names = {
+        '+': 'add[0, 1]',
+        '-': 'sub[0, 1]',
+        '*': 'mul[0, 1]',
+        '/': 'div[0, 1]',
+        'sqrt': 'sqrt[0]',
+        'loadU': 'ld1[0]',
+        'storeU': 'st1[0, 1]',
+        'abs': 'abs[0]',
+        '==': f'{cmp}eq[0, 1]',
+        '!=': f'{cmp}eq[0, 1]',
+        '<=': f'{cmp}le[0, 1]',
+        '<': f'{cmp}lt[0, 1]',
+        '>=': f'{cmp}ge[0, 1]',
+        '>': f'{cmp}gt[0, 1]',
+    }
+    bits = {'double': 64,
+            'float': 32,
+            'int': 32}
+    result = dict()
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        width = 'svcntd()' if data_type == 'double' else 'svcntw()'
+        intwidth = 'svcntw()'
+        result['bytes'] = 'svcntb()'
+    else:
+        width = bitwidth // bits[data_type]
+        intwidth = bitwidth // bits['int']
+        result['bytes'] = bitwidth // 8
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
+        base_names['stream'] = 'stnt1[0, 1]'
+        prefix = 'sv'
+        suffix = f'_f{bits[data_type]}'
+    elif instruction_set == 'neon':
+        prefix = 'v'
+        suffix = f'q_f{bits[data_type]}'
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
+        int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
+    else:
+        predicate = f'{prefix}whilelt_b{bits[data_type]}(0, {width})'
+        int_predicate = f'{prefix}whilelt_b{bits["int"]}(0, {intwidth})'
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+        arg_string = get_argument_string(function_shortcut, first=predicate if prefix == 'sv' else '')
+        if prefix == 'sv' and not name.startswith('ld') and not name.startswith('st') and not name.startswith(cmp):
+            undef = '_x'
+        else:
+            undef = ''
+        result[intrinsic_id] = prefix + name + suffix + undef + arg_string
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        result['width'] = CFunction(width, "int")
+        result['intwidth'] = CFunction(intwidth, "int")
+    else:
+        result['width'] = width
+        result['intwidth'] = intwidth
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
+        result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
+        result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
+        result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
+        if instruction_set != 'sme':
+            vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
+            result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                               vindex.format("{2}") + ', {1})'
+            result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                              vindex.format("{1}") + ')'
+        if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                                vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
+        result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
+        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
+        result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
+        result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
+        result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
+        result['blendv'] = f'svsel_f{bits[data_type]}' + '({2}, {1}, {0})'
+        result['any'] = f'svptest_any({predicate}, {{0}})'
+        result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
+        result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
+        result['maskStream'] = result['stream'].replace(predicate, '{2}')
+        if instruction_set != 'sme':
+            result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+            if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+                result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
+        result['streamFence'] = '__dmb(15)'
+        if instruction_set == 'sme':
+            result['function_prefix'] = '__attribute__((arm_locally_streaming))'
+        elif instruction_set not in ['sve', 'sve2', 'sme']:
+            result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
+    else:
+        result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
+        result['makeVec'] = f'makeVec_f{bits[data_type]}' + '(' + ", ".join(['{' + str(i) + '}' for i in
+                                                                             range(width)]) + ')'
+        result['makeVecConstInt'] = f'vdupq_n_s{bits["int"]}' + '({0})'
+        result['makeVecInt'] = f'makeVec_s{bits["int"]}' + '({0}, {1}, {2}, {3})'
+        result['+int'] = f"vaddq_s{bits['int']}" + "({0}, {1})"
+        result[data_type] = f'float{bits[data_type]}x{width}_t'
+        result['int'] = f'int{bits["int"]}x{intwidth}_t'
+        result['bool'] = f'uint{bits[data_type]}x{width}_t'
+        result['headers'] = ['<arm_neon.h>', '"arm_neon_helpers.h"']
+        result['!='] = f'vmvnq_u{bits[data_type]}({result["=="]})'
+        result['&'] = f'vandq_u{bits[data_type]}' + '({0}, {1})'
+        result['|'] = f'vorrq_u{bits[data_type]}' + '({0}, {1})'
+        result['blendv'] = f'vbslq_f{bits[data_type]}' + '({2}, {1}, {0})'
+        result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
+        result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
+        # SVE has real nontemporal stores, so we only need to zero cachlines on Neon
+        result['cachelineZero'] = 'cachelineZero((void*) {0})'
+    result['cachelineSize'] = 'cachelineSize()'
+    return result
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -7,13 +7,17 @@ import numpy as np
 import sympy as sp
 from sympy.core import S
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
+from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
-from pystencils.astnodes import KernelFunction, Node, CachelineSize
+from sympy.functions.elementary.hyperbolic import HyperbolicFunction
-from pystencils.cpu.vectorization import vec_all, vec_any
-from pystencils.data_types import (
+from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
-    PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
+from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
-    reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol)
+from pystencils.typing import (
+    PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
+    ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
+from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
+from pystencils.functions import DivFunc, AddressOf
 from pystencils.integer_functions import (
    bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
    int_div, int_power_of_2, modulo_ceil)
@@ -28,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy
 HEADER_REGEX = re.compile(r'^[<"].*[">]$')
-KERNCRAFT_NO_TERNARY_MODE = False
 def generate_c(ast_node: Node,
               signature_only: bool = False,
-               dialect='c',
+               dialect: Backend = Backend.C,
               custom_backend=None,
               with_globals=True) -> str:
    """Prints an abstract syntax tree node as C or CUDA code.
@@ -45,7 +47,7 @@ def generate_c(ast_node: Node,
    Args:
        ast_node: ast representation of kernel
        signature_only: generate signature without function body
-        dialect: 'c', 'cuda' or opencl
+        dialect: `Backend`: 'C' or 'CUDA'
        custom_backend: use own custom printer for code generation
        with_globals: enable usage of global variables
    Returns:
@@ -59,21 +61,19 @@ def generate_c(ast_node: Node,
            ast_node.global_variables = d.symbols_defined
    if custom_backend:
        printer = custom_backend
-    elif dialect == 'c':
+    elif dialect == Backend.C:
        try:
+            # TODO Vectorization Revamp: instruction_set should not be just slapped on ast
            instruction_set = ast_node.instruction_set
        except Exception:
            instruction_set = None
        printer = CBackend(signature_only=signature_only,
                           vector_instruction_set=instruction_set)
-    elif dialect == 'cuda':
+    elif dialect == Backend.CUDA:
        from pystencils.backends.cuda_backend import CudaBackend
        printer = CudaBackend(signature_only=signature_only)
-    elif dialect == 'opencl':
-        from pystencils.backends.opencl_backend import OpenClBackend
-        printer = OpenClBackend(signature_only=signature_only)
    else:
-        raise ValueError("Unknown dialect: " + str(dialect))
+        raise ValueError(f'Unknown {dialect=}')
    code = printer(ast_node)
    if not signature_only and isinstance(ast_node, KernelFunction):
        if with_globals and global_declarations:
@@ -121,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]:
    for h in headers:
        assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/'
-    return sorted(headers)
+    return headers
 # --------------------------------------- Backend Specific Nodes -------------------------------------------------------
+# TODO future CustomCodeNode should not be backend specific move it elsewhere
 class CustomCodeNode(Node):
    def __init__(self, code, symbols_read, symbols_defined, parent=None):
        super(CustomCodeNode, self).__init__(parent=parent)
@@ -150,8 +150,8 @@ class CustomCodeNode(Node):
    def undefined_symbols(self):
        return self._symbols_read - self._symbols_defined
-    def __eq___(self, other):
+    def __eq__(self, other):
-        return self._code == other._code
+        return type(self) is type(other) and self._code == other._code
    def __hash__(self):
        return hash(self._code)
@@ -171,7 +171,7 @@ class PrintNode(CustomCodeNode):
 # noinspection PyPep8Naming
 class CBackend:
-    def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect='c'):
+    def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect=Backend.C):
        if sympy_printer is None:
            if vector_instruction_set is not None:
                self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
@@ -184,6 +184,8 @@ class CBackend:
        self._indent = "   "
        self._dialect = dialect
        self._signatureOnly = signature_only
+        self._kwargs = {}
+        self.sympy_printer._kwargs = self._kwargs
    def __call__(self, node):
        prev_is = VectorType.instruction_set
@@ -196,18 +198,19 @@ class CBackend:
        if isinstance(node, str):
            return node
        for cls in type(node).__mro__:
-            method_name = "_print_" + cls.__name__
+            method_name = f"_print_{cls.__name__}"
            if hasattr(self, method_name):
                return getattr(self, method_name)(node)
-        raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__)
+        raise NotImplementedError(f"{self.__class__.__name__} does not support node of type {node.__class__.__name__}")
-    def _print_Type(self, node):
+    def _print_AbstractType(self, node):
        return str(node)
    def _print_KernelFunction(self, node):
-        function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()]
+        function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()
+                              if not type(s.symbol) is CFunction]
        launch_bounds = ""
-        if self._dialect == 'cuda':
+        if self._dialect == Backend.CUDA:
            max_threads = node.indexing.max_threads_per_block()
            if max_threads:
                launch_bounds = f"__launch_bounds__({max_threads}) "
@@ -227,11 +230,14 @@ class CBackend:
        return f"{node.pragma_line}\n{self._print_Block(node)}"
    def _print_LoopOverCoordinate(self, node):
-        counter_symbol = node.loop_counter_name
+        counter_name = node.loop_counter_name
-        start = f"int64_t {counter_symbol} = {self.sympy_printer.doprint(node.start)}"
+        counter_dtype = node.loop_counter_symbol.dtype.c_name
-        condition = f"{counter_symbol} < {self.sympy_printer.doprint(node.stop)}"
+        start = f"{counter_dtype} {counter_name} = {self.sympy_printer.doprint(node.start)}"
-        update = f"{counter_symbol} += {self.sympy_printer.doprint(node.step)}"
+        condition = f"{counter_name} < {self.sympy_printer.doprint(node.stop)}"
+        update = f"{counter_name} += {self.sympy_printer.doprint(node.step)}"
        loop_str = f"for ({start}; {condition}; {update})"
+        self._kwargs['loop_counter'] = counter_name
+        self._kwargs['loop_stop'] = node.stop
        prefix = "\n".join(node.prefix_lines)
        if prefix:
@@ -239,63 +245,113 @@ class CBackend:
        return f"{prefix}{loop_str}\n{self._print(node.body)}"
    def _print_SympyAssignment(self, node):
+        printed_lhs = self.sympy_printer.doprint(node.lhs)
+        printed_rhs = self.sympy_printer.doprint(node.rhs)
        if node.is_declaration:
            if node.use_auto:
-                data_type = 'auto '
+                data_type = 'auto'
            else:
+                data_type = self._print(node.lhs.dtype).replace(' const', '')
                if node.is_const:
-                    prefix = 'const '
+                    data_type = f'const {data_type}'
-                else:
+            return f"{data_type} {printed_lhs} = {printed_rhs};"
-                    prefix = ''
-                data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
-            return "%s%s = %s;" % (data_type,
-                                   self.sympy_printer.doprint(node.lhs),
-                                   self.sympy_printer.doprint(node.rhs))
        else:
-            lhs_type = get_type_of_expression(node.lhs)
+            lhs_type = get_type_of_expression(node.lhs)  # TOOD: this should have been typed
            printed_mask = ""
-            if type(lhs_type) is VectorType and isinstance(node.lhs, cast_func):
+            if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
-                arg, data_type, aligned, nontemporal, mask = node.lhs.args
+                arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
                instr = 'storeU'
-                if aligned:
+                if nontemporal and 'storeA' not in self._vector_instruction_set and \
+                        'stream' in self._vector_instruction_set:
+                    instr = 'stream'
+                elif aligned:
                    instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                if mask != True:  # NOQA
-                    instr = 'maskStore' if aligned else 'maskStoreU'
+                    instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
+                            'maskStoreA' if aligned else 'maskStoreU'
+                    if instr not in self._vector_instruction_set:
+                        if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
+                            store, load = 'stream', 'loadA'
+                        elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
+                            store, load = 'storeA', 'loadA'
+                        else:
+                            store, load = 'storeU', 'loadU'
+                        load = load if load in self._vector_instruction_set else 'loadU'
+                        self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs), **self._kwargs)
                    printed_mask = self.sympy_printer.doprint(mask)
-                    if self._vector_instruction_set['dataTypePrefix']['double'] == '__mm256d':
+                    if data_type.base_type.c_name == 'double':
-                        printed_mask = f"_mm256_castpd_si256({printed_mask})"
+                        if self._vector_instruction_set['double'] == '__m256d':
+                            printed_mask = f"_mm256_castpd_si256({printed_mask})"
+                        elif self._vector_instruction_set['double'] == '__m128d':
+                            printed_mask = f"_mm_castpd_si128({printed_mask})"
+                    elif data_type.base_type.c_name == 'float':
+                        if self._vector_instruction_set['float'] == '__m256':
+                            printed_mask = f"_mm256_castps_si256({printed_mask})"
+                        elif self._vector_instruction_set['float'] == '__m128':
+                            printed_mask = f"_mm_castps_si128({printed_mask})"
                rhs_type = get_type_of_expression(node.rhs)
                if type(rhs_type) is not VectorType:
-                    rhs = cast_func(node.rhs, VectorType(rhs_type))
+                    raise ValueError(f'Cannot vectorize {node.rhs} of type {rhs_type} inside of the pretty printer! '
+                                     f'This should have happen earlier!')
+                    # rhs = CastFunc(node.rhs, VectorType(rhs_type)) # Unknown width
                else:
                    rhs = node.rhs
                ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
+                if stride != 1:
+                    instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
+                             'maskStoreS') if mask != True else \
+                            ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS')  # NOQA
+                    return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
+                                                                      stride, printed_mask, **self._kwargs) + ';'
                pre_code = ''
-                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True:  # NOQA
-                    pre_code = f"if (((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0) " + "{\n\t" + \
+                    first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
-                        self._vector_instruction_set['cachelineZero'].format(ptr) + ';\n}\n'
+                    offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
+                                      * node.lhs.args[0].field.spatial_strides[i] for i in
+                                      range(len(node.lhs.args[0].field.spatial_strides))])
+                    if stride == 1:
+                        offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
+                    size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
+                    element_size = 8 if data_type.base_type.c_name == 'double' else 4
+                    size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
+                    pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
+                        self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'
                code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
-                                                                  printed_mask) + ';'
+                                                                  printed_mask, **self._kwargs) + ';'
                flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
                if nontemporal and 'flushCacheline' in self._vector_instruction_set:
                    code2 = self._vector_instruction_set['flushCacheline'].format(
-                        ptr, self.sympy_printer.doprint(rhs)) + ';'
+                        ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                    code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
-                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
-                    tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
+                    lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
+                    rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
+                    tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
                    code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                        + self.sympy_printer.doprint(rhs) + ';'
-                    code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask) + ';'
+                    code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
-                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask) \
+                    maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
-                        + ';'
+                    instr2 = maskStore if mask != True else store  # NOQA
+                    if instr2 not in self._vector_instruction_set:
+                        self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs),
+                            **self._kwargs)
+                    code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                    code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
                return pre_code + code
            else:
-                return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
+                return f"{printed_lhs} = {printed_rhs};"
    def _print_NontemporalFence(self, _):
        if 'streamFence' in self._vector_instruction_set:
@@ -387,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter):
    def __init__(self):
        super(CustomSympyPrinter, self).__init__()
-        self._float_type = create_type("float32")
    def _print_Pow(self, expr):
        """Don't use std::pow function, for small integer exponents, write as multiplication"""
-        if not expr.free_symbols:
+        # Ideally the printer has as little logic as possible. Therefore,
-            return self._typed_number(expr.evalf(), get_type_of_expression(expr))
+        # powers should be rewritten as `DivFunc`s / unevaluated `Mul`s before
+        # printing. `NodeCollection` offers a convenience function to do just
-        if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
+        # that. However, `cut_loops` rewrites unevaluated multiplications as
-            return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})"
+        # `Pow`s again. Neither `deepcopy` nor `func(*args)` are suited to
-        elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
+        # rebuild unevaluated expressions. Therefore, as long as we stick with
-            return f"1 / ({self._print(sp.Mul(*([expr.base] * -expr.exp), evaluate=False))})"
+        # SymPy, this is the only way to avoid printing `pow`s.
+        exp = expr.exp.expr if isinstance(expr.exp, CastFunc) else expr.exp
+        one_type = expr.base.dtype if hasattr(expr.base, "dtype") else get_type_of_expression(expr.base)
+        if exp.is_integer and exp.is_number and (0 < exp <= 8):
+            return f"({self._print(sp.Mul(*[expr.base] * exp, evaluate=False))})"
+        elif exp.is_integer and exp.is_number and (-8 <= exp < 0):
+            return f"{self._typed_number(1, one_type)} / ({self._print(sp.Mul(*([expr.base] * -exp), evaluate=False))})"
        else:
            return super(CustomSympyPrinter, self)._print_Pow(expr)
+    # TODO don't print ones in sp.Mul
    def _print_Rational(self, expr):
        """Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0"""
-        res = str(expr.evalf().num)
+        res = str(expr.evalf(17))
        return res
    def _print_Equality(self, expr):
@@ -421,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter):
        else:
            return f'fabs({self._print(expr.args[0])})'
-    def _print_Type(self, node):
+    def _print_AbstractType(self, node):
        return str(node)
    def _print_Function(self, expr):
@@ -434,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter):
        }
        if hasattr(expr, 'to_c'):
            return expr.to_c(self._print)
-        if isinstance(expr, reinterpret_cast_func):
+        if isinstance(expr, ReinterpretCastFunc):
            arg, data_type = expr.args
-            return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
+            if isinstance(data_type, PointerType):
-        elif isinstance(expr, address_of):
+                const_str = "const" if data_type.const else ""
+                return f"(({const_str} {self._print(data_type.base_type)} *)(& {self._print(arg)}))"
+            else:
+                return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
+        elif isinstance(expr, AddressOf):
            assert len(expr.args) == 1, "address_of must only have one argument"
            return f"&({self._print(expr.args[0])})"
-        elif isinstance(expr, cast_func):
+        elif isinstance(expr, CastFunc):
+            cast = "(({data_type})({code}))"
            arg, data_type = expr.args
-            if isinstance(arg, sp.Number) and arg.is_finite:
+            if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
                return self._typed_number(arg, data_type)
+            elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
+                    and data_type == BasicType('float32'):
+                known = self.known_functions[arg.__class__.__name__.lower()]
+                code = self._print(arg)
+                return code.replace(known, f"{known}f")
+            elif isinstance(arg, (sp.Pow, sp.exp)) and data_type == BasicType('float32'):
+                known = ['sqrt', 'cbrt', 'pow', 'exp']
+                code = self._print(arg)
+                for k in known:
+                    if k in code:
+                        return code.replace(k, f'{k}f')
+                # Powers of small integers are printed as divisions/multiplications.
+                if '/' in code or '*' in code:
+                    return cast.format(data_type=data_type, code=code)
+                raise ValueError(f"{code} doesn't give {known=} function back.")
            else:
-                return f"(({data_type})({self._print(arg)}))"
+                return cast.format(data_type=data_type, code=self._print(arg))
        elif isinstance(expr, fast_division):
-            return f"({self._print(expr.args[0] / expr.args[1])})"
+            raise ValueError("fast_division is only supported for Taget.GPU")
        elif isinstance(expr, fast_sqrt):
-            return f"({self._print(sp.sqrt(expr.args[0]))})"
+            raise ValueError("fast_sqrt is only supported for Taget.GPU")
+        elif isinstance(expr, fast_inv_sqrt):
+            raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
        elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
            return self._print(expr.args[0])
-        elif isinstance(expr, fast_inv_sqrt):
-            return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
        elif isinstance(expr, sp.Abs):
            return f"abs({self._print(expr.args[0])})"
-        elif isinstance(expr, sp.Max):
-            return self._print(expr)
        elif isinstance(expr, sp.Mod):
            if expr.args[0].is_integer and expr.args[1].is_integer:
                return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})"
@@ -469,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter):
            return f"(1 << ({self._print(expr.args[0])}))"
        elif expr.func == int_div:
            return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))"
+        elif expr.func == DivFunc:
+            return f'(({self._print(expr.divisor)}) / ({self._print(expr.dividend)}))'
        else:
            name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__
            arg_str = ', '.join(self._print(a) for a in expr.args)
@@ -480,55 +564,17 @@ class CustomSympyPrinter(CCodePrinter):
            return res + '.0f' if '.' not in res else res + 'f'
        elif dtype.numpy_dtype == np.float64:
            return res + '.0' if '.' not in res else res
+        elif dtype.is_int():
+            tokens = res.split('.')
+            if len(tokens) == 1: 
+                return res
+            elif int(tokens[1]) != 0:
+                raise ValueError(f"Cannot print non-integer number {res} as an integer.")
+            else:
+                return tokens[0]
        else:
            return res
-    def _print_Sum(self, expr):
-        template = """[&]() {{
-    {dtype} sum = ({dtype}) 0;
-    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
-        sum += {expr};
-    }}
-    return sum;
-}}()"""
-        var = expr.limits[0][0]
-        start = expr.limits[0][1]
-        end = expr.limits[0][2]
-        code = template.format(
-            dtype=get_type_of_expression(expr.args[0]),
-            iterator_dtype='int',
-            var=self._print(var),
-            start=self._print(start),
-            end=self._print(end),
-            expr=self._print(expr.function),
-            increment=str(1),
-            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
-        )
-        return code
-    def _print_Product(self, expr):
-        template = """[&]() {{
-    {dtype} product = ({dtype}) 1;
-    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
-        product *= {expr};
-    }}
-    return product;
-}}()"""
-        var = expr.limits[0][0]
-        start = expr.limits[0][1]
-        end = expr.limits[0][2]
-        code = template.format(
-            dtype=get_type_of_expression(expr.args[0]),
-            iterator_dtype='int',
-            var=self._print(var),
-            start=self._print(start),
-            end=self._print(end),
-            expr=self._print(expr.function),
-            increment=str(1),
-            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
-        )
-        return code
    def _print_ConditionalFieldAccess(self, node):
        return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True)))
@@ -552,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter):
            return f"(({a} < {b}) ? {a} : {b})"
        return inner_print_min(expr.args)
-    def _print_re(self, expr):
-        return f"real({self._print(expr.args[0])})"
-    def _print_im(self, expr):
-        return f"imag({self._print(expr.args[0])})"
-    def _print_ImaginaryUnit(self, expr):
-        return "complex<double>{0,1}"
-    def _print_TypedImaginaryUnit(self, expr):
-        if expr.dtype.numpy_dtype == np.complex64:
-            return "complex<float>{0,1}"
-        elif expr.dtype.numpy_dtype == np.complex128:
-            return "complex<double>{0,1}"
-        else:
-            raise NotImplementedError(
-                "only complex64 and complex128 supported")
-    def _print_Complex(self, expr):
-        return self._typed_number(expr, np.complex64)
 # noinspection PyPep8Naming
 class VectorizedCustomSympyPrinter(CustomSympyPrinter):
@@ -591,47 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
            return None
    def _print_Abs(self, expr):
-        if 'abs' in self.instruction_set and isinstance(expr.args[0], vector_memory_access):
+        if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
-            return self.instruction_set['abs'].format(self._print(expr.args[0]))
+            return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
        return super()._print_Abs(expr)
+    def _typed_vectorized_number(self, expr, data_type):
+        basic_data_type = data_type.base_type
+        number = self._typed_number(expr, basic_data_type)
+        instruction = 'makeVecConst'
+        if basic_data_type.is_bool():
+            instruction = 'makeVecConstBool'
+        # TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
+        elif basic_data_type.is_int():
+            instruction = 'makeVecConstInt'
+        return self.instruction_set[instruction].format(number, **self._kwargs)
+    def _typed_vectorized_symbol(self, expr, data_type):
+        if not isinstance(expr, TypedSymbol):
+            raise ValueError(f'{expr} is not a TypeSymbol. It is {expr.type=}')
+        basic_data_type = data_type.base_type
+        symbol = self._print(expr)
+        if basic_data_type != expr.dtype:
+            symbol = f'(({basic_data_type})({symbol}))'
+        instruction = 'makeVecConst'
+        if basic_data_type.is_bool():
+            instruction = 'makeVecConstBool'
+        # TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
+        elif basic_data_type.is_int():
+            instruction = 'makeVecConstInt'
+        return self.instruction_set[instruction].format(symbol, **self._kwargs)
+    def _print_CastFunc(self, expr):
+        arg, data_type = expr.args
+        if type(data_type) is VectorType:
+            base_type = data_type.base_type
+            # vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
+            assert not isinstance(arg, VectorMemoryAccess)
+            if isinstance(arg, sp.Tuple):
+                is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
+                is_integer = get_type_of_expression(arg[0]) == create_type("int")
+                printed_args = [self._print(a) for a in arg]
+                instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
+                if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
+                    increments = np.array(arg)[1:] - np.array(arg)[:-1]
+                    if len(set(increments)) == 1:
+                        return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
+                                                                           **self._kwargs)
+                return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
+            else:
+                if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
+                    return self._typed_vectorized_number(arg, data_type)
+                elif isinstance(arg, TypedSymbol):
+                    return self._typed_vectorized_symbol(arg, data_type)
+                elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
+                        and base_type == BasicType('float32'):
+                    raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet')
+                    # known = self.known_functions[arg.__class__.__name__.lower()]
+                    # code = self._print(arg)
+                    # return code.replace(known, f"{known}f")
+                elif isinstance(arg, sp.Pow):
+                    if base_type == BasicType('float32') or base_type == BasicType('float64'):
+                        return self._print_Pow(arg)
+                    else:
+                        raise NotImplementedError('Integer Pow is not implemented')
+                elif isinstance(arg, sp.UnevaluatedExpr):
+                    return self._print(arg.args[0])
+                else:
+                    raise NotImplementedError('Vectorizer cannot cast between different datatypes')
+                    # to_type = self.instruction_set['suffix'][data_type.base_type.c_name]
+                    # from_type = self.instruction_set['suffix'][get_type_of_expression(arg).base_type.c_name]
+                    # return self.instruction_set['cast'].format(from_type, to_type, self._print(arg))
+        else:
+            return self._scalarFallback('_print_Function', expr)
+            # raise ValueError(f'Non VectorType cast "{data_type}" in vectorized code.')
    def _print_Function(self, expr):
-        if isinstance(expr, vector_memory_access):
+        if isinstance(expr, VectorMemoryAccess):
-            arg, data_type, aligned, _, mask = expr.args
+            arg, data_type, aligned, _, mask, stride = expr.args
+            if stride != 1:
+                return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
            instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
-            return instruction.format("& " + self._print(arg))
+            return instruction.format(f"& {self._print(arg)}", **self._kwargs)
-        elif isinstance(expr, cast_func):
+        elif expr.func == DivFunc:
-            arg, data_type = expr.args
-            if type(data_type) is VectorType:
-                # vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
-                assert not isinstance(arg, vector_memory_access)
-                if isinstance(arg, sp.Tuple):
-                    is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
-                    is_integer = get_type_of_expression(arg[0]) == create_type("int")
-                    printed_args = [self._print(a) for a in arg]
-                    instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
-                    return self.instruction_set[instruction].format(*printed_args)
-                else:
-                    is_boolean = get_type_of_expression(arg) == create_type("bool")
-                    is_integer = get_type_of_expression(arg) == create_type("int") or \
-                        (isinstance(arg, TypedSymbol) and arg.dtype.is_int())
-                    instruction = 'makeVecConstBool' if is_boolean else \
-                                  'makeVecConstInt' if is_integer else 'makeVecConst'
-                    return self.instruction_set[instruction].format(self._print(arg))
-        elif expr.func == fast_division:
            result = self._scalarFallback('_print_Function', expr)
            if not result:
-                result = self.instruction_set['/'].format(self._print(expr.args[0]), self._print(expr.args[1]))
+                result = self.instruction_set['/'].format(self._print(expr.divisor), self._print(expr.dividend),
+                                                          **self._kwargs)
            return result
-        elif expr.func == fast_sqrt:
+        elif isinstance(expr, fast_division):
-            return f"({self._print(sp.sqrt(expr.args[0]))})"
+            raise ValueError("fast_division is only supported for Taget.GPU")
-        elif expr.func == fast_inv_sqrt:
+        elif isinstance(expr, fast_sqrt):
-            result = self._scalarFallback('_print_Function', expr)
+            raise ValueError("fast_sqrt is only supported for Taget.GPU")
-            if not result:
+        elif isinstance(expr, fast_inv_sqrt):
-                if self.instruction_set['rsqrt']:
+            raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
-                    return self.instruction_set['rsqrt'].format(self._print(expr.args[0]))
-                else:
-                    return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
        elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
            instr = 'any' if isinstance(expr, vec_any) else 'all'
            expr_type = get_type_of_expression(expr.args[0])
@@ -641,8 +719,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
                if isinstance(expr.args[0], sp.Rel):
                    op = expr.args[0].rel_op
                    if (instr, op) in self.instruction_set:
-                        return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args])
+                        return self.instruction_set[(instr, op)].format(*[self._print(a) for a in expr.args[0].args],
-                return self.instruction_set[instr].format(self._print(expr.args[0]))
+                                                                        **self._kwargs)
+                return self.instruction_set[instr].format(self._print(expr.args[0]), **self._kwargs)
        return super(VectorizedCustomSympyPrinter, self)._print_Function(expr)
@@ -655,7 +734,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        assert len(arg_strings) > 0
        result = arg_strings[0]
        for item in arg_strings[1:]:
-            result = self.instruction_set['&'].format(result, item)
+            result = self.instruction_set['&'].format(result, item, **self._kwargs)
        return result
    def _print_Or(self, expr):
@@ -667,7 +746,7 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        assert len(arg_strings) > 0
        result = arg_strings[0]
        for item in arg_strings[1:]:
-            result = self.instruction_set['|'].format(result, item)
+            result = self.instruction_set['|'].format(result, item, **self._kwargs)
        return result
    def _print_Add(self, expr, order=None):
@@ -681,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        # special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
        suffix = ""
-        if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
+        if all([(type(e) is CastFunc and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
                or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
-            dtype = set([e.dtype for e in args if type(e) is cast_func])
+            dtype = set([e.dtype for e in args if type(e) is CastFunc])
            assert len(dtype) == 1
            dtype = dtype.pop()
-            args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
+            args = [CastFunc(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
                    for e in args]
            suffix = "int"
@@ -708,29 +787,35 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        processed = summands[0].term
        for summand in summands[1:]:
            func = self.instruction_set['-' + suffix] if summand.sign == -1 else self.instruction_set['+' + suffix]
-            processed = func.format(processed, summand.term)
+            processed = func.format(processed, summand.term, **self._kwargs)
        return processed
    def _print_Pow(self, expr):
-        result = self._scalarFallback('_print_Pow', expr)
+        # Due to loop cutting sp.Mul is evaluated again.
+        try:
+            result = self._scalarFallback('_print_Pow', expr)
+        except ValueError:
+            result = None
        if result:
            return result
-        one = self.instruction_set['makeVecConst'].format(1.0)
+        one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
+        root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
-        if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
-            return "(" + self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")"
+        if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number:
-        elif expr.exp == -1:
+            exp = expr.exp.args[0]
-            one = self.instruction_set['makeVecConst'].format(1.0)
+        else:
-            return self.instruction_set['/'].format(one, self._print(expr.base))
+            exp = expr.exp
-        elif expr.exp == 0.5:
-            return self.instruction_set['sqrt'].format(self._print(expr.base))
+        # TODO the printer should not have any intelligence like this.
-        elif expr.exp == -0.5:
+        # TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting
-            root = self.instruction_set['sqrt'].format(self._print(expr.base))
+        if exp.is_integer and exp.is_number and 0 < exp < 8:
-            return self.instruction_set['/'].format(one, root)
+            return self._print(sp.Mul(*[expr.base] * exp, evaluate=False))
-        elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
+        elif exp == 0.5:
-            return self.instruction_set['/'].format(one,
+            return root
-                                                    self._print(sp.Mul(*[expr.base] * (-expr.exp), evaluate=False)))
+        elif exp == -0.5:
+            return self.instruction_set['/'].format(one, root, **self._kwargs)
        else:
            raise ValueError("Generic exponential not supported: " + str(expr))
@@ -738,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        # noinspection PyProtectedMember
        from sympy.core.mul import _keep_coeff
-        result = self._scalarFallback('_print_Mul', expr)
+        if not inside_add:
+            result = self._scalarFallback('_print_Mul', expr)
+        else:
+            result = None
        if result:
            return result
@@ -769,19 +857,19 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        result = a_str[0]
        for item in a_str[1:]:
-            result = self.instruction_set['*'].format(result, item)
+            result = self.instruction_set['*'].format(result, item, **self._kwargs)
        if len(b) > 0:
            denominator_str = b_str[0]
            for item in b_str[1:]:
-                denominator_str = self.instruction_set['*'].format(denominator_str, item)
+                denominator_str = self.instruction_set['*'].format(denominator_str, item, **self._kwargs)
-            result = self.instruction_set['/'].format(result, denominator_str)
+            result = self.instruction_set['/'].format(result, denominator_str, **self._kwargs)
        if inside_add:
            return sign, result
        else:
            if sign < 0:
-                return self.instruction_set['*'].format(self._print(S.NegativeOne), result)
+                return self.instruction_set['*'].format(self._print(S.NegativeOne), result, **self._kwargs)
            else:
                return result
@@ -789,13 +877,13 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        result = self._scalarFallback('_print_Relational', expr)
        if result:
            return result
-        return self.instruction_set[expr.rel_op].format(self._print(expr.lhs), self._print(expr.rhs))
+        return self.instruction_set[expr.rel_op].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
    def _print_Equality(self, expr):
        result = self._scalarFallback('_print_Equality', expr)
        if result:
            return result
-        return self.instruction_set['=='].format(self._print(expr.lhs), self._print(expr.rhs))
+        return self.instruction_set['=='].format(self._print(expr.lhs), self._print(expr.rhs), **self._kwargs)
    def _print_Piecewise(self, expr):
        result = self._scalarFallback('_print_Piecewise', expr)
@@ -813,13 +901,11 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        result = self._print(expr.args[-1][0])
        for true_expr, condition in reversed(expr.args[:-1]):
-            if isinstance(condition, cast_func) and get_type_of_expression(condition.args[0]) == create_type("bool"):
+            if isinstance(condition, CastFunc) and get_type_of_expression(condition.args[0]) == create_type("bool"):
-                if not KERNCRAFT_NO_TERNARY_MODE:
+                result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
-                    result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
+                                                       result, **self._kwargs)
-                                                           result)
-                else:
-                    print("Warning - skipping ternary op")
            else:
                # noinspection SpellCheckingInspection
-                result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition))
+                result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition),
+                                                               **self._kwargs)
        return result
--- a/pystencils/backends/cuda_backend.py
+++ b/pystencils/backends/cuda_backend.py
-from os.path import dirname, join
 from pystencils.astnodes import Node
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
+from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
-from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
-with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
-    lines = f.readlines()
-    CUDA_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l}
 def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N
    Returns:
        CUDA code for the ast node and its descendants
    """
-    return generate_c(ast_node, signature_only, dialect='cuda',
+    return generate_c(ast_node, signature_only, dialect=Backend.CUDA,
                      custom_backend=custom_backend, with_globals=with_globals)
@@ -33,7 +27,7 @@ class CudaBackend(CBackend):
        if not sympy_printer:
            sympy_printer = CudaSympyPrinter()
-        super().__init__(sympy_printer, signature_only, dialect='cuda')
+        super().__init__(sympy_printer, signature_only, dialect=Backend.CUDA)
    def _print_SharedMemoryAllocation(self, node):
        dtype = node.symbol.dtype
@@ -43,26 +37,13 @@ class CudaBackend(CBackend):
        return code
    @staticmethod
-    def _print_ThreadBlockSynchronization(node):
+    def _print_ThreadBlockSynchronization(_):
-        code = "__synchtreads();"
+        return "__synchtreads();"
-        return code
    def _print_TextureDeclaration(self, node):
+        cond = node.texture.field.dtype.numpy_dtype.itemsize > 4
-        # TODO: use fStrings here
+        return f'texture<{"fp_tex_" if cond else ""}{str(node.texture.field.dtype)}, ' \
-        if node.texture.field.dtype.numpy_dtype.itemsize > 4:
+               f'cudaTextureType{node.texture.field.spacial_dimensions}D, cudaReadModeElementType> {node.texture};'
-            code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
-                str(node.texture.field.dtype),
-                node.texture.field.spatial_dimensions,
-                node.texture
-            )
-        else:
-            code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
-                str(node.texture.field.dtype),
-                node.texture.field.spatial_dimensions,
-                node.texture
-            )
-        return code
    def _print_SkipIteration(self, _):
        return "return;"
@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
    def __init__(self):
        super(CudaSympyPrinter, self).__init__()
-        self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
-    def _print_InterpolatorAccess(self, node):
-        dtype = node.interpolator.field.dtype.numpy_dtype
-        if type(node) == DiffInterpolatorAccess:
-            # cubicTex3D_1st_derivative_x(texture tex, float3 coord)
-            template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)"  # noqa
-        elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
-            template = "cubicTex%iDSimple(%s, %s)"
-        else:
-            if dtype.itemsize > 4:
-                # Use PyCuda hack!
-                # https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
-                template = "fp_tex%iD(%s, %s)"
-            else:
-                template = "tex%iD(%s, %s)"
-        code = template % (
-            node.interpolator.field.spatial_dimensions,
-            str(node.interpolator),
-            # + 0.5 comes from Nvidia's staggered indexing
-            ', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
-        )
-        return code
    def _print_Function(self, expr):
        if isinstance(expr, fast_division):

--- a/pystencils/backends/dot.py
+++ b/pystencils/backends/dot.py
 import graphviz
-from graphviz import Digraph, lang
+try:
+    from graphviz import Digraph
+    import graphviz.quoting as quote
+except ImportError:
+    from graphviz import Digraph
+    import graphviz.lang as quote
 from sympy.printing.printer import Printer
@@ -12,7 +17,7 @@ class DotPrinter(Printer):
        super(DotPrinter, self).__init__()
        self._node_to_str_function = node_to_str_function
        self.dot = Digraph(**kwargs)
-        self.dot.quote_edge = lang.quote
+        self.dot.quote_edge = quote.quote
    def _print_KernelFunction(self, func):
        self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))

--- a/pystencils/backends/json.py
+++ b/pystencils/backends/json.py
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
@@ -78,7 +78,6 @@ def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
        # Clang and XL C++ are missing these for doubles
        result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
        result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
-        result['stream'] = result['stream'].format('(float*) {0}', '(__vector float) {1}')
        result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
                                                                                     '(__vector float) {1}')

--- a/src/pystencils/backends/riscv_instruction_sets.py
+++ b/src/pystencils/backends/riscv_instruction_sets.py
+from pystencils.typing import CFunction
+def get_argument_string(function_shortcut, last=''):
+    args = function_shortcut[function_shortcut.index('[') + 1: -1]
+    arg_string = "("
+    for arg in args.split(","):
+        arg = arg.strip()
+        if not arg:
+            continue
+        if arg in ('0', '1', '2', '3', '4', '5'):
+            arg_string += "{" + arg + "},"
+        else:
+            arg_string += arg + ","
+    if last:
+        arg_string += last + ','
+    arg_string = arg_string[:-1] + ")"
+    return arg_string
+def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
+    assert instruction_set == 'rvv'
+    bits = {'double': 64,
+            'float': 32,
+            'int': 32}
+    base_names = {
+        '+': 'fadd_vv[0, 1]',
+        '-': 'fsub_vv[0, 1]',
+        '*': 'fmul_vv[0, 1]',
+        '/': 'fdiv_vv[0, 1]',
+        'sqrt': 'fsqrt_v[0]',
+        'loadU': f'le{bits[data_type]}_v[0]',
+        'storeU': f'se{bits[data_type]}_v[0, 1]',
+        'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
+        'loadS': f'lse{bits[data_type]}_v[0, 1]',
+        'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
+        'abs': 'fabs_v[0]',
+        '==': 'mfeq_vv[0, 1]',
+        '!=': 'mfne_vv[0, 1]',
+        '<=': 'mfle_vv[0, 1]',
+        '<': 'mflt_vv[0, 1]',
+        '>=': 'mfge_vv[0, 1]',
+        '>': 'mfgt_vv[0, 1]',
+        '&': 'mand_mm[0, 1]',
+        '|': 'mor_mm[0, 1]',
+        'blendv': 'merge_vvm[2, 0, 1]',
+        'any': 'cpop_m[0]',
+        'all': 'cpop_m[0]',
+    }
+    result = dict()
+    width = f'vsetvlmax_e{bits[data_type]}m1()'
+    intwidth = 'vsetvlmax_e{bits["int"]}m1()'
+    result['bytes'] = 'vsetvlmax_e8m1()'
+    prefix = 'v'
+    suffix = f'_f{bits[data_type]}m1'
+    vl = '{loop_stop} - {loop_counter}'
+    int_vl = f'({vl})*{bits[data_type]//bits["int"]}'
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+        if name.startswith('mf'):
+            suffix2 = suffix + f'_b{bits[data_type]}'
+        elif name.endswith('_mm') or name.endswith('_m'):
+            suffix2 = f'_b{bits[data_type]}'
+        elif intrinsic_id.startswith('mask'):
+            suffix2 = suffix + '_m'
+        else:
+            suffix2 = suffix
+        arg_string = get_argument_string(function_shortcut, last=vl)
+        result[intrinsic_id] = prefix + name + suffix2 + arg_string
+    result['width'] = CFunction(width, "int")
+    result['intwidth'] = CFunction(intwidth, "int")
+    result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
+    result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
+    result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
+                             f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
+    result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
+    result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
+    result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
+    result['float'] = f'vfloat{bits["float"]}m1_t'
+    result['double'] = f'vfloat{bits["double"]}m1_t'
+    result['int'] = f'vint{bits["int"]}m1_t'
+    result['bool'] = f'vbool{bits[data_type]}_t'
+    result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
+    result['any'] += ' > 0x0'
+    result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+    return result
--- a/src/pystencils/backends/simd_instruction_sets.py
+++ b/src/pystencils/backends/simd_instruction_sets.py
+import os
+import platform
+from ctypes import CDLL, c_int, c_size_t, sizeof, byref
+from warnings import warn
+import numpy as np
+from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
+from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
+from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
+from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
+from pystencils.typing import numpy_name_to_c
+def get_vector_instruction_set(data_type='double', instruction_set='avx'):
+    if data_type == 'float':
+        warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
+             f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
+             f"releases")
+        data_type = 'float64'
+    type_name = numpy_name_to_c(np.dtype(data_type).name)
+    if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
+        return get_vector_instruction_set_arm(type_name, instruction_set)
+    elif instruction_set in ['vsx']:
+        return get_vector_instruction_set_ppc(type_name, instruction_set)
+    elif instruction_set in ['rvv']:
+        return get_vector_instruction_set_riscv(type_name, instruction_set)
+    else:
+        return get_vector_instruction_set_x86(type_name, instruction_set)
+@memorycache
+def get_supported_instruction_sets():
+    """List of supported instruction sets on current hardware, or None if query failed."""
+    if 'PYSTENCILS_SIMD' in os.environ:
+        return os.environ['PYSTENCILS_SIMD'].split(',')
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
+        result = ['neon']
+        libc = CDLL('/usr/lib/libc.dylib')
+        value = c_int(0)
+        size = c_size_t(sizeof(value))
+        status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
+        if status == 0 and value.value == 1:
+            result.insert(0, "sme")
+        return result
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
+        return ['neon']
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+        if hwcap & (1 << 22):  # HWCAP_SVE
+            if hwcap2 & (1 << 1):  # HWCAP2_SVE2
+                name = 'sve2'
+            else:
+                name = 'sve'
+            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
+            if length < 0:
+                raise OSError("SVE length query failed")
+            while length >= 128:
+                result.append(f"{name}{length}")
+                length //= 2
+            result.append(name)
+        if hwcap2 & (1 << 23):  # HWCAP2_SME
+            result.insert(0, "sme")  # prepend to list so it is not automatically chosen as best instruction set
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            return None
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
+        flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        if not flags.isdisjoint(possible_avx512vl_flags):
+            result.append("avx512vl")
+        return result
+    else:
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))
+@memorycache
+def get_cacheline_size(instruction_set):
+    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
+       Usually, this is identical to the cache line size."""
+    instruction_sets = get_vector_instruction_set('double', instruction_set)
+    if 'cachelineSize' not in instruction_sets:
+        return None
+    import pystencils as ps
+    from pystencils.astnodes import SympyAssignment
+    import numpy as np
+    from pystencils.cpu.vectorization import CachelineSize
+    arr = np.zeros((1, 1), dtype=np.float32)
+    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
+    ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
+    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
+    kernel = ast.compile()
+    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
+    return int(arr[0, 0])
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -51,35 +51,22 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
        'makeVecConstBool': 'set[]',
        'makeVecInt': 'set[]',
        'makeVecConstInt': 'set[]',
        'loadU': 'loadu[0]',
        'loadA': 'load[0]',
        'storeU': 'storeu[0,1]',
        'storeA': 'store[0,1]',
        'stream': 'stream[0,1]',
-        'maskstore': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
+        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
-        'maskload': 'mask_load[0, 2, 1]' if instruction_set == 'avx512' else 'maskload[0, 2, 1]'
+        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
    }
-    if instruction_set == 'avx512':
-        base_names.update({
-            'maskStore': 'mask_store[0, 2, 1]',
-            'maskStoreU': 'mask_storeu[0, 2, 1]',
-            'maskLoad': 'mask_load[2, 1, 0]',
-            'maskLoadU': 'mask_loadu[2, 1, 0]'
-        })
-    if instruction_set == 'avx':
-        base_names.update({
-            'maskStore': 'maskstore[0, 2, 1]',
-            'maskStoreU': 'maskstore[0, 2, 1]',
-            'maskLoad': 'maskload[0, 1]',
-            'maskLoadU': 'maskloadu[0, 1]'
-        })
    for comparison_op, constant in comparisons.items():
        base_names[comparison_op] = f'cmp[0, 1, {constant}]'
    headers = {
        'avx512': ['<immintrin.h>'],
+        'avx512vl': ['<immintrin.h>'],
        'avx': ['<immintrin.h>'],
        'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
                '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
@@ -93,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    prefix = {
        'sse': '_mm',
        'avx': '_mm256',
+        'avx512vl': '_mm256',
        'avx512': '_mm512',
    }
@@ -103,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
        ("double", "avx"): 4,
        ("float", "avx"): 8,
        ("int", "avx"): 8,
+        ("double", "avx512vl"): 4,
+        ("float", "avx512vl"): 8,
+        ("int", "avx512vl"): 8,
        ("double", "avx512"): 8,
        ("float", "avx512"): 16,
        ("int", "avx512"): 16,
    }
    result = {
        'width': width[(data_type, instruction_set)],
        'intwidth': width[('int', instruction_set)],
@@ -125,15 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
            suf = suffix[data_type]
            arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
-        mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else ''
+        mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
-    result['dataTypePrefix'] = {
-        'double': "_" + pre + 'd',
-        'float': "_" + pre,
-    }
-    result['rsqrt'] = None
    bit_width = result['width'] * (64 if data_type == 'double' else 32)
    result['double'] = f"__m{bit_width}d"
    result['float'] = f"__m{bit_width}"
@@ -144,22 +128,46 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
    result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
-    if instruction_set == 'avx512':
+    setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
-        size = 8 if data_type == 'double' else 16
-        result['&'] = f'_kand_mask{size}({{0}}, {{1}})'
+    if instruction_set.startswith('avx512'):
-        result['|'] = f'_kor_mask{size}({{0}}, {{1}})'
+        size = result['width']
-        result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})'
+        masksize = max(size, 8)
-        result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})'
+        result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
+        result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
+        result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
+        result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
        result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
        result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
-        result['abs'] = f"{pre}_abs_{suf}({{0}})"
+        result['bool'] = f"__mmask{masksize}"
-        result['bool'] = f"__mmask{size}"
        params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
        result['makeVecBool'] = f"__mmask8(({params}) )"
        params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
        result['makeVecConstBool'] = f"__mmask8(({params}) )"
+        vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
+                 ', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
+        vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
+        scale = bit_width // size // 8
+        result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
+                           f', {{1}}, {scale})'
+        result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
+                               f', {{1}}, {scale})'
+        if bit_width == 512:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
+        else:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
+    # abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
+    if instruction_set == 'avx512':
+        result['abs'] = f"{pre}_abs_{suf}({{0}})"
+    else:
+        result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
+                        f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
+                        'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
+                        f"{pre}_cast{suf}_si{bit_width}({{0}})))"
    if instruction_set == 'avx' and data_type == 'float':
        result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"

--- a/src/pystencils/bit_masks.py
+++ b/src/pystencils/bit_masks.py
+import sympy as sp
+# from pystencils.typing import get_type_of_expression
+# noinspection PyPep8Naming
+class flag_cond(sp.Function):
+    """Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
+    depending on whether the flag is set. 
+    Three argument version:
+    ```
+        flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
+    ```
+    Four argument version:
+    ```
+        flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
+    ```
+    """
+    nargs = (3, 4)
+    def __new__(cls, flag_bit, mask_expression, *expressions):
+        # TODO Jan reintroduce checking
+        # flag_dtype = get_type_of_expression(flag_bit)
+        # if not flag_dtype.is_int():
+        #     raise ValueError('Argument flag_bit must be of integer type.')
+        #
+        # mask_dtype = get_type_of_expression(mask_expression)
+        # if not mask_dtype.is_int():
+        #     raise ValueError('Argument mask_expression must be of integer type.')
+        return super().__new__(cls, flag_bit, mask_expression, *expressions)
+    def to_c(self, print_func):
+        flag_bit = self.args[0]
+        mask = self.args[1]
+        then_expression = self.args[2]
+        flag_bit_code = print_func(flag_bit)
+        mask_code = print_func(mask)
+        then_code = print_func(then_expression)
+        code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
+        if len(self.args) > 3:
+            else_expression = self.args[3]
+            else_code = print_func(else_expression)
+            code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
+        return code
--- a/pystencils/boundaries/__init__.py
+++ b/pystencils/boundaries/__init__.py
--- a/pystencils/boundaries/boundaryconditions.py
+++ b/pystencils/boundaries/boundaryconditions.py
 from typing import Any, List, Tuple
-from pystencils import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
-from pystencils.data_types import create_type
+from pystencils.typing import create_type
 class Boundary:
@@ -14,7 +14,7 @@ class Boundary:
    def __init__(self, name=None):
        self._name = name
-    def __call__(self, field, direction_symbol, index_field) -> List[Assignment]:
+    def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
        """Defines the boundary behavior and must therefore be implemented by all boundaries.
        Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
@@ -63,20 +63,20 @@ class Neumann(Boundary):
        neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
        if field.index_dimensions == 0:
-            return [Assignment(field.center, field[neighbor])]
+            return [SympyAssignment(field.center, field[neighbor])]
        else:
            from itertools import product
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
            index_iter = product(*(range(i) for i in field.index_shape))
-            return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
+            return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
    def __hash__(self):
        # All boundaries of these class behave equal -> should also be equal
        return hash("Neumann")
    def __eq__(self, other):
-        return type(other) == Neumann
+        return type(other) is Neumann
 class Dirichlet(Boundary):
@@ -103,11 +103,11 @@ class Dirichlet(Boundary):
    def __call__(self, field, direction_symbol, index_field, **kwargs):
        if field.index_dimensions == 0:
-            return [Assignment(field.center, index_field("value") if self.additional_data else self._value)]
+            return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
        elif field.index_dimensions == 1:
            assert not self.additional_data
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Field needs fixed index shape")
            assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
-            return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
+            return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
        raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
+from functools import lru_cache
 import numpy as np
 import sympy as sp
-from pystencils import create_indexed_kernel
+from pystencils import create_kernel, CreateKernelConfig, Target
-from pystencils.assignment import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CustomCodeNode
 from pystencils.boundaries.createindexlist import (
    create_boundary_index_array, numpy_data_type_for_boundary_object)
-from pystencils.cache import memorycache
+from pystencils.typing import TypedSymbol, create_type
-from pystencils.data_types import TypedSymbol, create_type
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
-from pystencils.datahandling.pycuda import PyCudaArrayHandler
 from pystencils.field import Field
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 try:
    # noinspection PyPep8Naming
    import waLBerla as wlb
    if wlb.cpp_available:
        from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
    else:
        ParallelDataHandling = None
 except ImportError:
@@ -33,11 +35,11 @@ class FlagInterface:
        >>> dh = create_data_handling((4, 5))
        >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
        >>> assert dh.has_data('flag_field')
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        2
-        >>> fi.reserve_flag(4)
+        >>> int(fi.reserve_flag(4))
        4
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        8
    """
@@ -84,7 +86,7 @@ class FlagInterface:
 class BoundaryHandling:
    def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
-                 target='cpu', openmp=True):
+                 target: Target = Target.CPU, openmp=True):
        assert data_handling.has_data(field_name)
        assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
        self._data_handling = data_handling
@@ -99,7 +101,7 @@ class BoundaryHandling:
        self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
        if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = PyCudaArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
        else:
            array_handler = self.data_handling.array_handler
@@ -115,7 +117,8 @@ class BoundaryHandling:
            for obj, cpu_arr in cpu_version.items():
                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = array_handler.to_gpu(cpu_arr)
+                    gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
+                    array_handler.upload(gpu_version[obj], cpu_arr)
                else:
                    array_handler.upload(gpu_version[obj], cpu_arr)
@@ -378,15 +381,15 @@ class BoundaryDataSetter:
        assert coord < self.dim
        return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
-    @memorycache()
+    @lru_cache()
    def link_offsets(self):
        return self.stencil[self.index_array['dir']]
-    @memorycache()
+    @lru_cache()
    def link_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
-    @memorycache()
+    @lru_cache()
    def boundary_cell_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
        code = "\n"
        for i in range(dim):
            offset_str = ", ".join([str(d[i]) for d in stencil])
-            code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
+            code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
        inv_dirs = []
        for direction in stencil:
            inverse_dir = tuple([-i for i in direction])
            inv_dirs.append(str(stencil.index(inverse_dir)))
-        code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
+        code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
        offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
        super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
                                                 symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
    @staticmethod
    def _offset_symbols(dim):
-        return [TypedSymbol(f"c{d}", create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]]
+        return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
-    INV_DIR_SYMBOL = TypedSymbol("invdir", "int")
+    INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
-def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', **kernel_creation_args):
+def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
    elements = [BoundaryOffsetInfo(stencil)]
-    index_arr_dtype = index_field.dtype.numpy_dtype
+    dir_symbol = TypedSymbol("dir", np.int32)
-    dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0])
+    elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
-    elements += [Assignment(dir_symbol, index_field[0]('dir'))]
    elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
-    return create_indexed_kernel(elements, [index_field], target=target, **kernel_creation_args)
+    config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
+                                **kernel_creation_args)
+    return create_kernel(elements, config=config)
--- a/pystencils/boundaries/createindexlist.py
+++ b/pystencils/boundaries/createindexlist.py
-import itertools
 import warnings
 import numpy as np
 try:
-    # Try to import right away - assume compiled code is available
+    import pyximport
-    # compile with: python setup.py build_ext --inplace --use-cython
-    from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-        create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
+    pyximport.install(language_level=3)
    cython_funcs_available = True
 except ImportError:
-    try:
+    cython_funcs_available = False
-        # If not, try development mode and import via pyximport
-        import pyximport
+if cython_funcs_available:
+    from pystencils.boundaries.createindexlistcython import (
-        pyximport.install(language_level=3)
+        create_boundary_neighbor_index_list_2d,
-        cython_funcs_available = True
+        create_boundary_neighbor_index_list_3d,
-    except ImportError:
+        create_boundary_cell_index_list_2d,
-        cython_funcs_available = False
+        create_boundary_cell_index_list_3d,
-    if cython_funcs_available:
+    )
-        from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-            create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, \
-            create_boundary_cell_index_list_3d
 boundary_index_array_coordinate_names = ["x", "y", "z"]
 direction_member_name = "dir"
+default_index_array_dtype = np.int32
 def numpy_data_type_for_boundary_object(boundary_object, dim):
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    return np.dtype([(name, np.int32) for name in coordinate_names]
+    return np.dtype(
-                    + [(direction_member_name, np.int32)]
+        [(name, default_index_array_dtype) for name in coordinate_names]
-                    + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True)
+        + [(direction_member_name, default_index_array_dtype)]
+        + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
+        align=True,
-def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask,
+    )
-                                                fluid_mask, stencil, single_link):
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+def _create_index_list_python(
+    flag_field_arr,
-    result = []
+    boundary_mask,
-    gl = nr_of_ghost_layers
+    fluid_mask,
-    for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])):
+    stencil,
-        cell = cell[::-1]
+    single_link,
-        if not flag_field_arr[cell] & fluid_mask:
+    inner_or_boundary=False,
-            continue
+    nr_of_ghost_layers=None,
+):
+    if inner_or_boundary and nr_of_ghost_layers is None:
+        raise ValueError(
+            "If inner_or_boundary is set True the number of ghost layers "
+            "around the inner domain has to be specified"
+        )
+    if nr_of_ghost_layers is None:
+        nr_of_ghost_layers = 0
+    coordinate_names = boundary_index_array_coordinate_names[
+        : len(flag_field_arr.shape)
+    ]
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+    )
+    # boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
+    # have to be sorted.
+    boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
+    for i in range(len(flag_field_arr.shape)):
+        boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
+    # First a set is created to save all fluid cells which are near boundary
+    fluid_cells = set()
+    for cell in boundary_cells:
+        cell = tuple(cell)
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            if flag_field_arr[neighbor_cell] & boundary_mask:
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
-                result.append(cell + (dir_idx,))
+            )
-                if single_link:
+            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
-                    break
+            if any(
+                not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
-    return np.array(result, dtype=index_arr_dtype)
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
+                continue
+            if flag_field_arr[neighbor_cell] & fluid_mask:
+                fluid_cells.add(neighbor_cell)
+    # then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
+    fluid_cells = list(fluid_cells)
+    if len(flag_field_arr.shape) == 3:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
+    else:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
-def _create_boundary_cell_index_list_python(flag_field_arr, boundary_mask,
+    cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
-                                            fluid_mask, stencil, single_link):
+    checkmask = boundary_mask if inner_or_boundary else fluid_mask
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
    result = []
-    for cell in itertools.product(*reversed([range(0, i) for i in flag_field_arr.shape])):
+    for cell in cells_to_iterate:
-        cell = cell[::-1]
+        cell = tuple(cell)
-        if not flag_field_arr[cell] & boundary_mask:
+        sum_cells = np.zeros(len(cell))
-            continue
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field_arr.shape)):
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
+            )
+            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
+            if any(
+                not 0 <= e < upper
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
                continue
-            if flag_field_arr[neighbor_cell] & fluid_mask:
+            if flag_field_arr[neighbor_cell] & checkmask:
-                result.append(cell + (dir_idx,))
                if single_link:
-                    break
+                    sum_cells += np.array(direction)
+                else:
+                    result.append(tuple(cell) + (dir_idx,))
+        # the discrete normal direction is the one which gives the maximum inner product to the stencil direction
+        if single_link and any(sum_cells != 0):
+            idx = np.argmax(np.inner(sum_cells, stencil))
+            result.append(tuple(cell) + (idx,))
    return np.array(result, dtype=index_arr_dtype)
-def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
+def create_boundary_index_list(
-                               nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
    """Creates a numpy array storing links (connections) between domain cells and boundary cells.
    Args:
@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        nr_of_ghost_layers: only relevant if neighbors is True
        inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
                    if false the boundary cells are listed
-        single_link: if true only the first link is reported from this cell
+        single_link: if true only the link in normal direction to this cell is reported
    """
    dim = len(flag_field.shape)
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
-    stencil = np.array(stencil, dtype=np.int32)
+        + [(direction_member_name, default_index_array_dtype)]
-    args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link)
+    )
+    stencil = np.array(stencil, dtype=default_index_array_dtype)
+    args = (
+        flag_field,
+        nr_of_ghost_layers,
+        boundary_mask,
+        fluid_mask,
+        stencil,
+        single_link,
+    )
    args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
    if cython_funcs_available:
@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        return np.array(idx_list, dtype=index_arr_dtype)
    else:
        if flag_field.size > 1e6:
-            warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up")
+            warnings.warn(
-        if inner_or_boundary:
+                "Boundary setup may take very long! Consider installing cython to speed it up"
-            return _create_boundary_neighbor_index_list_python(*args)
+            )
-        else:
+        return _create_index_list_python(
-            return _create_boundary_cell_index_list_python(*args_no_gl)
+            *args_no_gl,
+            inner_or_boundary=inner_or_boundary,
+            nr_of_ghost_layers=nr_of_ghost_layers,
-def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object,
+        )
-                                nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
-    idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
-                                           nr_of_ghost_layers, inner_or_boundary, single_link)
+def create_boundary_index_array(
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    boundary_object,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
+    idx_array = create_boundary_index_list(
+        flag_field,
+        stencil,
+        boundary_mask,
+        fluid_mask,
+        nr_of_ghost_layers,
+        inner_or_boundary,
+        single_link,
+    )
    dim = len(flag_field.shape)
    if boundary_object.additional_data:
        coordinate_names = boundary_index_array_coordinate_names[:dim]
        index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
        extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
-        for prop in coordinate_names + ['dir']:
+        for prop in coordinate_names + ["dir"]:
            extended_idx_field[prop] = idx_array[prop]
        idx_array = extended_idx_field

--- a/pystencils/boundaries/createindexlistcython.pyx
+++ b/pystencils/boundaries/createindexlistcython.pyx
-# distutils: language=c
+# cython: language_level=3str
-# Workaround for cython bug
-# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
-WORKAROUND = "Something"
 import cython
@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
        for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & fluid_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if flag_field[x + dx, y + dy] & boundary_mask:
-                        boundary_index_list.append((x,y, dirIdx))
                        if single_link:
-                            break
+                            sum_x += dx; sum_y += dy;
+                        else:
+                            boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
        for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
            for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & fluid_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
-                            boundary_index_list.append((x,y,z, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy; sum_z += dz
+                            else:
+                                boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx = 0
+                if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx * sum_x + dy * sum_y + dz * sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(0, ys):
        for x in range(0, xs):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & boundary_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if 0 <= x + dx < xs and 0 <= y + dy < ys:
                        if flag_field[x + dx, y + dy] & fluid_mask:
-                            boundary_index_list.append((x,y, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy
+                            else:
+                                boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    for z in range(0, zs):
        for y in range(0, ys):
            for x in range(0, xs):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & boundary_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
                            if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
-                                boundary_index_list.append((x,y,z, dirIdx))
                                if single_link:
-                                    break
+                                    sum_x += dx; sum_y += dy; sum_z += dz
+                                else:
+                                    boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx=0
+                if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx*sum_x + dy*sum_y + dz*sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
\ No newline at end of file
--- a/pystencils/boundaries/inkernel.py
+++ b/pystencils/boundaries/inkernel.py
 import sympy as sp
 from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
-from pystencils.data_types import TypedSymbol, create_type
+from pystencils.typing import TypedSymbol, create_type
 from pystencils.field import Field
 from pystencils.integer_functions import bitwise_and
No results found