5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/backends/arm_instruction_sets.py
+++ b/pystencils/backends/arm_instruction_sets.py
+from pystencils.typing import CFunction
 def get_argument_string(function_shortcut, first=''):
    args = function_shortcut[function_shortcut.index('[') + 1: -1]
    arg_string = "("
@@ -16,10 +19,13 @@ def get_argument_string(function_shortcut, first=''):
 def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
-    if instruction_set != 'neon' and not instruction_set.startswith('sve'):
+    if instruction_set not in ['neon', 'sme'] and not instruction_set.startswith('sve'):
        raise NotImplementedError(instruction_set)
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
+        cmp = 'cmp'
+    elif instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
        cmp = 'cmp'
+        bitwidth = int(instruction_set[4:])
    elif instruction_set.startswith('sve'):
        cmp = 'cmp'
        bitwidth = int(instruction_set[3:])
@@ -35,9 +41,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        'sqrt': 'sqrt[0]',
        'loadU': 'ld1[0]',
-        'loadA': 'ld1[0]',
        'storeU': 'st1[0, 1]',
-        'storeA': 'st1[0, 1]',
        'abs': 'abs[0]',
        '==': f'{cmp}eq[0, 1]',
@@ -54,7 +58,7 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
    result = dict()
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
        width = 'svcntd()' if data_type == 'double' else 'svcntw()'
        intwidth = 'svcntw()'
        result['bytes'] = 'svcntb()'
@@ -62,14 +66,15 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        width = bitwidth // bits[data_type]
        intwidth = bitwidth // bits['int']
        result['bytes'] = bitwidth // 8
-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
+        base_names['stream'] = 'stnt1[0, 1]'
        prefix = 'sv'
-        suffix = f'_f{bits[data_type]}' 
+        suffix = f'_f{bits[data_type]}'
    elif instruction_set == 'neon':
        prefix = 'v'
-        suffix = f'q_f{bits[data_type]}' 
+        suffix = f'q_f{bits[data_type]}'
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
        predicate = f'{prefix}whilelt_b{bits[data_type]}_u64({{loop_counter}}, {{loop_stop}})'
        int_predicate = f'{prefix}whilelt_b{bits["int"]}_u64({{loop_counter}}, {{loop_stop}})'
    else:
@@ -88,33 +93,36 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        result[intrinsic_id] = prefix + name + suffix + undef + arg_string
-    if instruction_set == 'sve':
+    if instruction_set in ['sve', 'sve2', 'sme']:
-        from pystencils.backends.cbackend import CFunction
        result['width'] = CFunction(width, "int")
        result['intwidth'] = CFunction(intwidth, "int")
    else:
        result['width'] = width
        result['intwidth'] = intwidth
-    if instruction_set.startswith('sve'):
+    if instruction_set.startswith('sve') or instruction_set == 'sme':
        result['makeVecConst'] = f'svdup_f{bits[data_type]}' + '({0})'
        result['makeVecConstInt'] = f'svdup_s{bits["int"]}' + '({0})'
        result['makeVecIndex'] = f'svindex_s{bits["int"]}' + '({0}, {1})'
-        vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
+        if instruction_set != 'sme':
-        result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+            vindex = f'svindex_u{bits[data_type]}(0, {{0}})'
-                           vindex.format("{2}") + ', {1})'
+            result['storeS'] = f'svst1_scatter_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
-        result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                               vindex.format("{2}") + ', {1})'
-                          vindex.format("{1}") + ')'
+            result['loadS'] = f'svld1_gather_u{bits[data_type]}index_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                              vindex.format("{1}") + ')'
+        if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+            result['streamS'] = f'svstnt1_scatter_u{bits[data_type]}offset_f{bits[data_type]}({predicate}, {{0}}, ' + \
+                                vindex.format(f"{{2}}*{bits[data_type]//8}") + ', {1})'
        result['+int'] = f"svadd_s{bits['int']}_x({int_predicate}, " + "{0}, {1})"
-        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set != "sve" else ""}t'
+        result['float'] = f'svfloat{bits["float"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
-        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set != "sve" else ""}t'
+        result['double'] = f'svfloat{bits["double"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
-        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set != "sve" else ""}t'
+        result['int'] = f'svint{bits["int"]}_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
-        result['bool'] = f'svbool_{"s" if instruction_set != "sve" else ""}t'
+        result['bool'] = f'svbool_{"s" if instruction_set not in ["sve", "sve2", "sme"] else ""}t'
-        result['headers'] = ['<arm_sve.h>', '"arm_neon_helpers.h"']
+        result['headers'] = ['<arm_sve.h>', '<arm_acle.h>', '"arm_neon_helpers.h"']
        result['&'] = f'svand_b_z({predicate},' + ' {0}, {1})'
        result['|'] = f'svorr_b_z({predicate},' + ' {0}, {1})'
@@ -123,10 +131,17 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        result['all'] = f'svcntp_b{bits[data_type]}({predicate}, {{0}}) == {width}'
        result['maskStoreU'] = result['storeU'].replace(predicate, '{2}')
-        result['maskStoreA'] = result['storeA'].replace(predicate, '{2}')
+        result['maskStream'] = result['stream'].replace(predicate, '{2}')
-        result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+        if instruction_set != 'sme':
+            result['maskStoreS'] = result['storeS'].replace(predicate, '{3}')
+            if instruction_set.startswith('sve2') and instruction_set not in ('sve256', 'sve2048'):
+                result['maskStreamS'] = result['streamS'].replace(predicate, '{3}')
-        if instruction_set != 'sve':
+        result['streamFence'] = '__dmb(15)'
+        if instruction_set == 'sme':
+            result['function_prefix'] = '__attribute__((arm_locally_streaming))'
+        elif instruction_set not in ['sve', 'sve2', 'sme']:
            result['compile_flags'] = [f'-msve-vector-bits={bitwidth}']
    else:
        result['makeVecConst'] = f'vdupq_n_f{bits[data_type]}' + '({0})'
@@ -151,9 +166,9 @@ def get_vector_instruction_set_arm(data_type='double', instruction_set='neon'):
        result['any'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) > 0'
        result['all'] = f'vaddlvq_u8(vreinterpretq_u8_u{bits[data_type]}({{0}})) == 16*0xff'
-    if instruction_set == 'sve' or bitwidth & (bitwidth - 1) == 0:
+        # SVE has real nontemporal stores, so we only need to zero cachlines on Neon
-        # only power-of-2 vector sizes will evenly divide a cacheline
-        result['cachelineSize'] = 'cachelineSize()'
        result['cachelineZero'] = 'cachelineZero((void*) {0})'
+    result['cachelineSize'] = 'cachelineSize()'
    return result
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -6,15 +6,18 @@ from typing import Set
 import numpy as np
 import sympy as sp
 from sympy.core import S
-from sympy.core.cache import cacheit
 from sympy.logic.boolalg import BooleanFalse, BooleanTrue
+from sympy.functions.elementary.trigonometric import TrigonometricFunction, InverseTrigonometricFunction
+from sympy.functions.elementary.hyperbolic import HyperbolicFunction
 from pystencils.astnodes import KernelFunction, LoopOverCoordinate, Node
 from pystencils.cpu.vectorization import vec_all, vec_any, CachelineSize
-from pystencils.data_types import (
+from pystencils.typing import (
-    PointerType, VectorType, address_of, cast_func, create_type, get_type_of_expression,
+    PointerType, VectorType, CastFunc, create_type, get_type_of_expression,
-    reinterpret_cast_func, vector_memory_access, BasicType, TypedSymbol)
+    ReinterpretCastFunc, VectorMemoryAccess, BasicType, TypedSymbol, CFunction)
+from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
+from pystencils.functions import DivFunc, AddressOf
 from pystencils.integer_functions import (
    bit_shift_left, bit_shift_right, bitwise_and, bitwise_or, bitwise_xor,
    int_div, int_power_of_2, modulo_ceil)
@@ -29,12 +32,10 @@ __all__ = ['generate_c', 'CustomCodeNode', 'PrintNode', 'get_headers', 'CustomSy
 HEADER_REGEX = re.compile(r'^[<"].*[">]$')
-KERNCRAFT_NO_TERNARY_MODE = False
 def generate_c(ast_node: Node,
               signature_only: bool = False,
-               dialect='c',
+               dialect: Backend = Backend.C,
               custom_backend=None,
               with_globals=True) -> str:
    """Prints an abstract syntax tree node as C or CUDA code.
@@ -46,7 +47,7 @@ def generate_c(ast_node: Node,
    Args:
        ast_node: ast representation of kernel
        signature_only: generate signature without function body
-        dialect: 'c', 'cuda' or opencl
+        dialect: `Backend`: 'C' or 'CUDA'
        custom_backend: use own custom printer for code generation
        with_globals: enable usage of global variables
    Returns:
@@ -60,21 +61,19 @@ def generate_c(ast_node: Node,
            ast_node.global_variables = d.symbols_defined
    if custom_backend:
        printer = custom_backend
-    elif dialect == 'c':
+    elif dialect == Backend.C:
        try:
+            # TODO Vectorization Revamp: instruction_set should not be just slapped on ast
            instruction_set = ast_node.instruction_set
        except Exception:
            instruction_set = None
        printer = CBackend(signature_only=signature_only,
                           vector_instruction_set=instruction_set)
-    elif dialect == 'cuda':
+    elif dialect == Backend.CUDA:
        from pystencils.backends.cuda_backend import CudaBackend
        printer = CudaBackend(signature_only=signature_only)
-    elif dialect == 'opencl':
-        from pystencils.backends.opencl_backend import OpenClBackend
-        printer = OpenClBackend(signature_only=signature_only)
    else:
-        raise ValueError("Unknown dialect: " + str(dialect))
+        raise ValueError(f'Unknown {dialect=}')
    code = printer(ast_node)
    if not signature_only and isinstance(ast_node, KernelFunction):
        if with_globals and global_declarations:
@@ -122,12 +121,12 @@ def get_headers(ast_node: Node) -> Set[str]:
    for h in headers:
        assert HEADER_REGEX.match(h), f'header /{h}/ does not follow the pattern /"..."/ or /<...>/'
-    return sorted(headers)
+    return headers
 # --------------------------------------- Backend Specific Nodes -------------------------------------------------------
+# TODO future CustomCodeNode should not be backend specific move it elsewhere
 class CustomCodeNode(Node):
    def __init__(self, code, symbols_read, symbols_defined, parent=None):
        super(CustomCodeNode, self).__init__(parent=parent)
@@ -151,8 +150,8 @@ class CustomCodeNode(Node):
    def undefined_symbols(self):
        return self._symbols_read - self._symbols_defined
-    def __eq___(self, other):
+    def __eq__(self, other):
-        return self._code == other._code
+        return type(self) is type(other) and self._code == other._code
    def __hash__(self):
        return hash(self._code)
@@ -166,30 +165,13 @@ class PrintNode(CustomCodeNode):
        self.headers.append("<iostream>")
-class CFunction(TypedSymbol):
-    def __new__(cls, function, dtype):
-        return CFunction.__xnew_cached_(cls, function, dtype)
-    def __new_stage2__(cls, function, dtype):
-        return super(CFunction, cls).__xnew__(cls, function, dtype)
-    __xnew__ = staticmethod(__new_stage2__)
-    __xnew_cached_ = staticmethod(cacheit(__new_stage2__))
-    def __getnewargs__(self):
-        return self.name, self.dtype
-    def __getnewargs_ex__(self):
-        return (self.name, self.dtype), {}
 # ------------------------------------------- Printer ------------------------------------------------------------------
 # noinspection PyPep8Naming
 class CBackend:
-    def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect='c'):
+    def __init__(self, sympy_printer=None, signature_only=False, vector_instruction_set=None, dialect=Backend.C):
        if sympy_printer is None:
            if vector_instruction_set is not None:
                self.sympy_printer = VectorizedCustomSympyPrinter(vector_instruction_set)
@@ -216,19 +198,19 @@ class CBackend:
        if isinstance(node, str):
            return node
        for cls in type(node).__mro__:
-            method_name = "_print_" + cls.__name__
+            method_name = f"_print_{cls.__name__}"
            if hasattr(self, method_name):
                return getattr(self, method_name)(node)
-        raise NotImplementedError(self.__class__.__name__ + " does not support node of type " + node.__class__.__name__)
+        raise NotImplementedError(f"{self.__class__.__name__} does not support node of type {node.__class__.__name__}")
-    def _print_Type(self, node):
+    def _print_AbstractType(self, node):
        return str(node)
    def _print_KernelFunction(self, node):
        function_arguments = [f"{self._print(s.symbol.dtype)} {s.symbol.name}" for s in node.get_parameters()
                              if not type(s.symbol) is CFunction]
        launch_bounds = ""
-        if self._dialect == 'cuda':
+        if self._dialect == Backend.CUDA:
            max_threads = node.indexing.max_threads_per_block()
            if max_threads:
                launch_bounds = f"__launch_bounds__({max_threads}) "
@@ -248,12 +230,13 @@ class CBackend:
        return f"{node.pragma_line}\n{self._print_Block(node)}"
    def _print_LoopOverCoordinate(self, node):
-        counter_symbol = node.loop_counter_name
+        counter_name = node.loop_counter_name
-        start = f"int64_t {counter_symbol} = {self.sympy_printer.doprint(node.start)}"
+        counter_dtype = node.loop_counter_symbol.dtype.c_name
-        condition = f"{counter_symbol} < {self.sympy_printer.doprint(node.stop)}"
+        start = f"{counter_dtype} {counter_name} = {self.sympy_printer.doprint(node.start)}"
-        update = f"{counter_symbol} += {self.sympy_printer.doprint(node.step)}"
+        condition = f"{counter_name} < {self.sympy_printer.doprint(node.stop)}"
+        update = f"{counter_name} += {self.sympy_printer.doprint(node.step)}"
        loop_str = f"for ({start}; {condition}; {update})"
-        self._kwargs['loop_counter'] = counter_symbol
+        self._kwargs['loop_counter'] = counter_name
        self._kwargs['loop_stop'] = node.stop
        prefix = "\n".join(node.prefix_lines)
@@ -262,41 +245,50 @@ class CBackend:
        return f"{prefix}{loop_str}\n{self._print(node.body)}"
    def _print_SympyAssignment(self, node):
+        printed_lhs = self.sympy_printer.doprint(node.lhs)
+        printed_rhs = self.sympy_printer.doprint(node.rhs)
        if node.is_declaration:
            if node.use_auto:
-                data_type = 'auto '
+                data_type = 'auto'
            else:
+                data_type = self._print(node.lhs.dtype).replace(' const', '')
                if node.is_const:
-                    prefix = 'const '
+                    data_type = f'const {data_type}'
-                else:
+            return f"{data_type} {printed_lhs} = {printed_rhs};"
-                    prefix = ''
-                data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "
-            return "%s%s = %s;" % (data_type,
-                                   self.sympy_printer.doprint(node.lhs),
-                                   self.sympy_printer.doprint(node.rhs))
        else:
-            lhs_type = get_type_of_expression(node.lhs)
+            lhs_type = get_type_of_expression(node.lhs)  # TOOD: this should have been typed
            printed_mask = ""
-            if type(lhs_type) is VectorType and isinstance(node.lhs, cast_func):
+            if type(lhs_type) is VectorType and isinstance(node.lhs, CastFunc):
                arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
                instr = 'storeU'
-                if aligned:
+                if nontemporal and 'storeA' not in self._vector_instruction_set and \
+                        'stream' in self._vector_instruction_set:
+                    instr = 'stream'
+                elif aligned:
                    instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
                if mask != True:  # NOQA
-                    instr = 'maskStoreA' if aligned else 'maskStoreU'
+                    instr = 'maskStream' if nontemporal and 'maskStream' in self._vector_instruction_set else \
+                            'maskStoreA' if aligned else 'maskStoreU'
                    if instr not in self._vector_instruction_set:
-                        self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
+                        if instr == 'maskStream' and 'stream' in self._vector_instruction_set:
+                            store, load = 'stream', 'loadA'
+                        elif (instr in ('maskStream', 'maskStoreA')) and 'storeA' in self._vector_instruction_set:
+                            store, load = 'storeA', 'loadA'
+                        else:
+                            store, load = 'storeU', 'loadU'
+                        load = load if load in self._vector_instruction_set else 'loadU'
+                        self._vector_instruction_set[instr] = self._vector_instruction_set[store].format(
                            '{0}', self._vector_instruction_set['blendv'].format(
-                                self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs),
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
                                '{1}', '{2}', **self._kwargs), **self._kwargs)
                    printed_mask = self.sympy_printer.doprint(mask)
-                    if data_type.base_type.base_name == 'double':
+                    if data_type.base_type.c_name == 'double':
                        if self._vector_instruction_set['double'] == '__m256d':
                            printed_mask = f"_mm256_castpd_si256({printed_mask})"
                        elif self._vector_instruction_set['double'] == '__m128d':
                            printed_mask = f"_mm_castpd_si128({printed_mask})"
-                    elif data_type.base_type.base_name == 'float':
+                    elif data_type.base_type.c_name == 'float':
                        if self._vector_instruction_set['float'] == '__m256':
                            printed_mask = f"_mm256_castps_si256({printed_mask})"
                        elif self._vector_instruction_set['float'] == '__m128':
@@ -304,25 +296,31 @@ class CBackend:
                rhs_type = get_type_of_expression(node.rhs)
                if type(rhs_type) is not VectorType:
-                    rhs = cast_func(node.rhs, VectorType(rhs_type))
+                    raise ValueError(f'Cannot vectorize {node.rhs} of type {rhs_type} inside of the pretty printer! '
+                                     f'This should have happen earlier!')
+                    # rhs = CastFunc(node.rhs, VectorType(rhs_type)) # Unknown width
                else:
                    rhs = node.rhs
                ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])
                if stride != 1:
-                    instr = 'maskStoreS' if mask != True else 'storeS'  # NOQA
+                    instr = ('maskStreamS' if nontemporal and 'maskStreamS' in self._vector_instruction_set else
+                             'maskStoreS') if mask != True else \
+                            ('streamS' if nontemporal and 'streamS' in self._vector_instruction_set else 'storeS')  # NOQA
                    return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                      stride, printed_mask, **self._kwargs) + ';'
                pre_code = ''
-                if nontemporal and 'cachelineZero' in self._vector_instruction_set:
+                if nontemporal and 'cachelineZero' in self._vector_instruction_set and mask == True:  # NOQA
                    first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
                    offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
                                      * node.lhs.args[0].field.spatial_strides[i] for i in
                                      range(len(node.lhs.args[0].field.spatial_strides))])
+                    if stride == 1:
+                        offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
                    size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
-                    element_size = 8 if data_type.base_type.base_name == 'double' else 4
+                    element_size = 8 if data_type.base_type.c_name == 'double' else 4
                    size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
                    pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
                        self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'
@@ -334,17 +332,26 @@ class CBackend:
                    code2 = self._vector_instruction_set['flushCacheline'].format(
                        ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                    code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
-                elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
+                elif aligned and nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
-                    tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
+                    lhs_hash = hashlib.sha1(self.sympy_printer.doprint(node.lhs).encode('ascii')).hexdigest()[:8]
+                    rhs_hash = hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
+                    tmpvar = f'_tmp_{lhs_hash}_{rhs_hash}'
                    code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                        + self.sympy_printer.doprint(rhs) + ';'
                    code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
-                    code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask,
+                    maskStore, store, load = 'maskStoreAAndFlushCacheline', 'storeAAndFlushCacheline', 'loadA'
-                                                                                           **self._kwargs) + ';'
+                    instr2 = maskStore if mask != True else store  # NOQA
+                    if instr2 not in self._vector_instruction_set:
+                        self._vector_instruction_set[maskStore] = self._vector_instruction_set[store].format(
+                            '{0}', self._vector_instruction_set['blendv'].format(
+                                self._vector_instruction_set[load].format('{0}', **self._kwargs),
+                                '{1}', '{2}', **self._kwargs),
+                            **self._kwargs)
+                    code2 = self._vector_instruction_set[instr2].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                    code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
                return pre_code + code
            else:
-                return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
+                return f"{printed_lhs} = {printed_rhs};"
    def _print_NontemporalFence(self, _):
        if 'streamFence' in self._vector_instruction_set:
@@ -436,23 +443,31 @@ class CustomSympyPrinter(CCodePrinter):
    def __init__(self):
        super(CustomSympyPrinter, self).__init__()
-        self._float_type = create_type("float32")
    def _print_Pow(self, expr):
        """Don't use std::pow function, for small integer exponents, write as multiplication"""
-        if not expr.free_symbols:
+        # Ideally the printer has as little logic as possible. Therefore,
-            return self._typed_number(expr.evalf(), get_type_of_expression(expr))
+        # powers should be rewritten as `DivFunc`s / unevaluated `Mul`s before
+        # printing. `NodeCollection` offers a convenience function to do just
-        if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
+        # that. However, `cut_loops` rewrites unevaluated multiplications as
-            return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})"
+        # `Pow`s again. Neither `deepcopy` nor `func(*args)` are suited to
-        elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
+        # rebuild unevaluated expressions. Therefore, as long as we stick with
-            return f"1 / ({self._print(sp.Mul(*([expr.base] * -expr.exp), evaluate=False))})"
+        # SymPy, this is the only way to avoid printing `pow`s.
+        exp = expr.exp.expr if isinstance(expr.exp, CastFunc) else expr.exp
+        one_type = expr.base.dtype if hasattr(expr.base, "dtype") else get_type_of_expression(expr.base)
+        if exp.is_integer and exp.is_number and (0 < exp <= 8):
+            return f"({self._print(sp.Mul(*[expr.base] * exp, evaluate=False))})"
+        elif exp.is_integer and exp.is_number and (-8 <= exp < 0):
+            return f"{self._typed_number(1, one_type)} / ({self._print(sp.Mul(*([expr.base] * -exp), evaluate=False))})"
        else:
            return super(CustomSympyPrinter, self)._print_Pow(expr)
+    # TODO don't print ones in sp.Mul
    def _print_Rational(self, expr):
        """Evaluate all rationals i.e. print 0.25 instead of 1.0/4.0"""
-        res = str(expr.evalf().num)
+        res = str(expr.evalf(17))
        return res
    def _print_Equality(self, expr):
@@ -470,7 +485,7 @@ class CustomSympyPrinter(CCodePrinter):
        else:
            return f'fabs({self._print(expr.args[0])})'
-    def _print_Type(self, node):
+    def _print_AbstractType(self, node):
        return str(node)
    def _print_Function(self, expr):
@@ -483,30 +498,48 @@ class CustomSympyPrinter(CCodePrinter):
        }
        if hasattr(expr, 'to_c'):
            return expr.to_c(self._print)
-        if isinstance(expr, reinterpret_cast_func):
+        if isinstance(expr, ReinterpretCastFunc):
            arg, data_type = expr.args
-            return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
+            if isinstance(data_type, PointerType):
-        elif isinstance(expr, address_of):
+                const_str = "const" if data_type.const else ""
+                return f"(({const_str} {self._print(data_type.base_type)} *)(& {self._print(arg)}))"
+            else:
+                return f"*(({self._print(PointerType(data_type, restrict=False))})(& {self._print(arg)}))"
+        elif isinstance(expr, AddressOf):
            assert len(expr.args) == 1, "address_of must only have one argument"
            return f"&({self._print(expr.args[0])})"
-        elif isinstance(expr, cast_func):
+        elif isinstance(expr, CastFunc):
+            cast = "(({data_type})({code}))"
            arg, data_type = expr.args
-            if isinstance(arg, sp.Number) and arg.is_finite:
+            if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
                return self._typed_number(arg, data_type)
+            elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
+                    and data_type == BasicType('float32'):
+                known = self.known_functions[arg.__class__.__name__.lower()]
+                code = self._print(arg)
+                return code.replace(known, f"{known}f")
+            elif isinstance(arg, (sp.Pow, sp.exp)) and data_type == BasicType('float32'):
+                known = ['sqrt', 'cbrt', 'pow', 'exp']
+                code = self._print(arg)
+                for k in known:
+                    if k in code:
+                        return code.replace(k, f'{k}f')
+                # Powers of small integers are printed as divisions/multiplications.
+                if '/' in code or '*' in code:
+                    return cast.format(data_type=data_type, code=code)
+                raise ValueError(f"{code} doesn't give {known=} function back.")
            else:
-                return f"(({data_type})({self._print(arg)}))"
+                return cast.format(data_type=data_type, code=self._print(arg))
        elif isinstance(expr, fast_division):
-            return f"({self._print(expr.args[0] / expr.args[1])})"
+            raise ValueError("fast_division is only supported for Taget.GPU")
        elif isinstance(expr, fast_sqrt):
-            return f"({self._print(sp.sqrt(expr.args[0]))})"
+            raise ValueError("fast_sqrt is only supported for Taget.GPU")
+        elif isinstance(expr, fast_inv_sqrt):
+            raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
        elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
            return self._print(expr.args[0])
-        elif isinstance(expr, fast_inv_sqrt):
-            return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
        elif isinstance(expr, sp.Abs):
            return f"abs({self._print(expr.args[0])})"
-        elif isinstance(expr, sp.Max):
-            return self._print(expr)
        elif isinstance(expr, sp.Mod):
            if expr.args[0].is_integer and expr.args[1].is_integer:
                return f"({self._print(expr.args[0])} % {self._print(expr.args[1])})"
@@ -518,6 +551,8 @@ class CustomSympyPrinter(CCodePrinter):
            return f"(1 << ({self._print(expr.args[0])}))"
        elif expr.func == int_div:
            return f"(({self._print(expr.args[0])}) / ({self._print(expr.args[1])}))"
+        elif expr.func == DivFunc:
+            return f'(({self._print(expr.divisor)}) / ({self._print(expr.dividend)}))'
        else:
            name = expr.name if hasattr(expr, 'name') else expr.__class__.__name__
            arg_str = ', '.join(self._print(a) for a in expr.args)
@@ -540,52 +575,6 @@ class CustomSympyPrinter(CCodePrinter):
        else:
            return res
-    def _print_Sum(self, expr):
-        template = """[&]() {{
-    {dtype} sum = ({dtype}) 0;
-    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
-        sum += {expr};
-    }}
-    return sum;
-}}()"""
-        var = expr.limits[0][0]
-        start = expr.limits[0][1]
-        end = expr.limits[0][2]
-        code = template.format(
-            dtype=get_type_of_expression(expr.args[0]),
-            iterator_dtype='int',
-            var=self._print(var),
-            start=self._print(start),
-            end=self._print(end),
-            expr=self._print(expr.function),
-            increment=str(1),
-            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
-        )
-        return code
-    def _print_Product(self, expr):
-        template = """[&]() {{
-    {dtype} product = ({dtype}) 1;
-    for ( {iterator_dtype} {var} = {start}; {condition}; {var} += {increment} ) {{
-        product *= {expr};
-    }}
-    return product;
-}}()"""
-        var = expr.limits[0][0]
-        start = expr.limits[0][1]
-        end = expr.limits[0][2]
-        code = template.format(
-            dtype=get_type_of_expression(expr.args[0]),
-            iterator_dtype='int',
-            var=self._print(var),
-            start=self._print(start),
-            end=self._print(end),
-            expr=self._print(expr.function),
-            increment=str(1),
-            condition=self._print(var) + ' <= ' + self._print(end)  # if start < end else '>='
-        )
-        return code
    def _print_ConditionalFieldAccess(self, node):
        return self._print(sp.Piecewise((node.outofbounds_value, node.outofbounds_condition), (node.access, True)))
@@ -609,27 +598,6 @@ class CustomSympyPrinter(CCodePrinter):
            return f"(({a} < {b}) ? {a} : {b})"
        return inner_print_min(expr.args)
-    def _print_re(self, expr):
-        return f"real({self._print(expr.args[0])})"
-    def _print_im(self, expr):
-        return f"imag({self._print(expr.args[0])})"
-    def _print_ImaginaryUnit(self, expr):
-        return "complex<double>{0,1}"
-    def _print_TypedImaginaryUnit(self, expr):
-        if expr.dtype.numpy_dtype == np.complex64:
-            return "complex<float>{0,1}"
-        elif expr.dtype.numpy_dtype == np.complex128:
-            return "complex<double>{0,1}"
-        else:
-            raise NotImplementedError(
-                "only complex64 and complex128 supported")
-    def _print_Complex(self, expr):
-        return self._typed_number(expr, np.complex64)
 # noinspection PyPep8Naming
 class VectorizedCustomSympyPrinter(CustomSympyPrinter):
@@ -648,55 +616,100 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
            return None
    def _print_Abs(self, expr):
-        if 'abs' in self.instruction_set and isinstance(expr.args[0], vector_memory_access):
+        if isinstance(get_type_of_expression(expr), (VectorType, VectorMemoryAccess)):
            return self.instruction_set['abs'].format(self._print(expr.args[0]), **self._kwargs)
        return super()._print_Abs(expr)
+    def _typed_vectorized_number(self, expr, data_type):
+        basic_data_type = data_type.base_type
+        number = self._typed_number(expr, basic_data_type)
+        instruction = 'makeVecConst'
+        if basic_data_type.is_bool():
+            instruction = 'makeVecConstBool'
+        # TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
+        elif basic_data_type.is_int():
+            instruction = 'makeVecConstInt'
+        return self.instruction_set[instruction].format(number, **self._kwargs)
+    def _typed_vectorized_symbol(self, expr, data_type):
+        if not isinstance(expr, TypedSymbol):
+            raise ValueError(f'{expr} is not a TypeSymbol. It is {expr.type=}')
+        basic_data_type = data_type.base_type
+        symbol = self._print(expr)
+        if basic_data_type != expr.dtype:
+            symbol = f'(({basic_data_type})({symbol}))'
+        instruction = 'makeVecConst'
+        if basic_data_type.is_bool():
+            instruction = 'makeVecConstBool'
+        # TODO Vectorization Revamp: is int, or sint, or uint (my guess is sint)
+        elif basic_data_type.is_int():
+            instruction = 'makeVecConstInt'
+        return self.instruction_set[instruction].format(symbol, **self._kwargs)
+    def _print_CastFunc(self, expr):
+        arg, data_type = expr.args
+        if type(data_type) is VectorType:
+            base_type = data_type.base_type
+            # vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
+            assert not isinstance(arg, VectorMemoryAccess)
+            if isinstance(arg, sp.Tuple):
+                is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
+                is_integer = get_type_of_expression(arg[0]) == create_type("int")
+                printed_args = [self._print(a) for a in arg]
+                instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
+                if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
+                    increments = np.array(arg)[1:] - np.array(arg)[:-1]
+                    if len(set(increments)) == 1:
+                        return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
+                                                                           **self._kwargs)
+                return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
+            else:
+                if arg.is_Number and not isinstance(arg, (sp.core.numbers.Infinity, sp.core.numbers.NegativeInfinity)):
+                    return self._typed_vectorized_number(arg, data_type)
+                elif isinstance(arg, TypedSymbol):
+                    return self._typed_vectorized_symbol(arg, data_type)
+                elif isinstance(arg, (InverseTrigonometricFunction, TrigonometricFunction, HyperbolicFunction)) \
+                        and base_type == BasicType('float32'):
+                    raise NotImplementedError('Vectorizer is not tested for trigonometric functions yet')
+                    # known = self.known_functions[arg.__class__.__name__.lower()]
+                    # code = self._print(arg)
+                    # return code.replace(known, f"{known}f")
+                elif isinstance(arg, sp.Pow):
+                    if base_type == BasicType('float32') or base_type == BasicType('float64'):
+                        return self._print_Pow(arg)
+                    else:
+                        raise NotImplementedError('Integer Pow is not implemented')
+                elif isinstance(arg, sp.UnevaluatedExpr):
+                    return self._print(arg.args[0])
+                else:
+                    raise NotImplementedError('Vectorizer cannot cast between different datatypes')
+                    # to_type = self.instruction_set['suffix'][data_type.base_type.c_name]
+                    # from_type = self.instruction_set['suffix'][get_type_of_expression(arg).base_type.c_name]
+                    # return self.instruction_set['cast'].format(from_type, to_type, self._print(arg))
+        else:
+            return self._scalarFallback('_print_Function', expr)
+            # raise ValueError(f'Non VectorType cast "{data_type}" in vectorized code.')
    def _print_Function(self, expr):
-        if isinstance(expr, vector_memory_access):
+        if isinstance(expr, VectorMemoryAccess):
            arg, data_type, aligned, _, mask, stride = expr.args
            if stride != 1:
-                return self.instruction_set['loadS'].format("& " + self._print(arg), stride, **self._kwargs)
+                return self.instruction_set['loadS'].format(f"& {self._print(arg)}", stride, **self._kwargs)
            instruction = self.instruction_set['loadA'] if aligned else self.instruction_set['loadU']
-            return instruction.format("& " + self._print(arg), **self._kwargs)
+            return instruction.format(f"& {self._print(arg)}", **self._kwargs)
-        elif isinstance(expr, cast_func):
+        elif expr.func == DivFunc:
-            arg, data_type = expr.args
-            if type(data_type) is VectorType:
-                # vector_memory_access is a cast_func itself so it should't be directly inside a cast_func
-                assert not isinstance(arg, vector_memory_access)
-                if isinstance(arg, sp.Tuple):
-                    is_boolean = get_type_of_expression(arg[0]) == create_type("bool")
-                    is_integer = get_type_of_expression(arg[0]) == create_type("int")
-                    printed_args = [self._print(a) for a in arg]
-                    instruction = 'makeVecBool' if is_boolean else 'makeVecInt' if is_integer else 'makeVec'
-                    if instruction == 'makeVecInt' and 'makeVecIndex' in self.instruction_set:
-                        increments = np.array(arg)[1:] - np.array(arg)[:-1]
-                        if len(set(increments)) == 1:
-                            return self.instruction_set['makeVecIndex'].format(printed_args[0], increments[0],
-                                                                               **self._kwargs)
-                    return self.instruction_set[instruction].format(*printed_args, **self._kwargs)
-                else:
-                    is_boolean = get_type_of_expression(arg) == create_type("bool")
-                    is_integer = get_type_of_expression(arg) == create_type("int") or \
-                        (isinstance(arg, TypedSymbol) and not isinstance(arg.dtype, VectorType) and arg.dtype.is_int())
-                    instruction = 'makeVecConstBool' if is_boolean else \
-                                  'makeVecConstInt' if is_integer else 'makeVecConst'
-                    return self.instruction_set[instruction].format(self._print(arg), **self._kwargs)
-        elif expr.func == fast_division:
            result = self._scalarFallback('_print_Function', expr)
            if not result:
-                result = self.instruction_set['/'].format(self._print(expr.args[0]), self._print(expr.args[1]),
+                result = self.instruction_set['/'].format(self._print(expr.divisor), self._print(expr.dividend),
                                                          **self._kwargs)
            return result
-        elif expr.func == fast_sqrt:
+        elif isinstance(expr, fast_division):
-            return f"({self._print(sp.sqrt(expr.args[0]))})"
+            raise ValueError("fast_division is only supported for Taget.GPU")
-        elif expr.func == fast_inv_sqrt:
+        elif isinstance(expr, fast_sqrt):
-            result = self._scalarFallback('_print_Function', expr)
+            raise ValueError("fast_sqrt is only supported for Taget.GPU")
-            if not result:
+        elif isinstance(expr, fast_inv_sqrt):
-                if 'rsqrt' in self.instruction_set:
+            raise ValueError("fast_inv_sqrt is only supported for Taget.GPU")
-                    return self.instruction_set['rsqrt'].format(self._print(expr.args[0]), **self._kwargs)
-                else:
-                    return f"({self._print(1 / sp.sqrt(expr.args[0]))})"
        elif isinstance(expr, vec_any) or isinstance(expr, vec_all):
            instr = 'any' if isinstance(expr, vec_any) else 'all'
            expr_type = get_type_of_expression(expr.args[0])
@@ -747,12 +760,12 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        # special treatment for all-integer args, for loop index arithmetic until we have proper int vectorization
        suffix = ""
-        if all([(type(e) is cast_func and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
+        if all([(type(e) is CastFunc and str(e.dtype) == self.instruction_set['int']) or isinstance(e, sp.Integer)
                or (type(e) is TypedSymbol and isinstance(e.dtype, BasicType) and e.dtype.is_int()) for e in args]):
-            dtype = set([e.dtype for e in args if type(e) is cast_func])
+            dtype = set([e.dtype for e in args if type(e) is CastFunc])
            assert len(dtype) == 1
            dtype = dtype.pop()
-            args = [cast_func(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
+            args = [CastFunc(e, dtype) if (isinstance(e, sp.Integer) or isinstance(e, TypedSymbol)) else e
                    for e in args]
            suffix = "int"
@@ -778,26 +791,31 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        return processed
    def _print_Pow(self, expr):
-        result = self._scalarFallback('_print_Pow', expr)
+        # Due to loop cutting sp.Mul is evaluated again.
+        try:
+            result = self._scalarFallback('_print_Pow', expr)
+        except ValueError:
+            result = None
        if result:
            return result
        one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
+        root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
-        if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
+        if isinstance(expr.exp, CastFunc) and expr.exp.args[0].is_number:
-            return "(" + self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")"
+            exp = expr.exp.args[0]
-        elif expr.exp == -1:
+        else:
-            one = self.instruction_set['makeVecConst'].format(1.0, **self._kwargs)
+            exp = expr.exp
-            return self.instruction_set['/'].format(one, self._print(expr.base), **self._kwargs)
-        elif expr.exp == 0.5:
+        # TODO the printer should not have any intelligence like this.
-            return self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
+        # TODO To remove all of these cases the vectoriser needs to be reworked. See loop cutting
-        elif expr.exp == -0.5:
+        if exp.is_integer and exp.is_number and 0 < exp < 8:
-            root = self.instruction_set['sqrt'].format(self._print(expr.base), **self._kwargs)
+            return self._print(sp.Mul(*[expr.base] * exp, evaluate=False))
+        elif exp == 0.5:
+            return root
+        elif exp == -0.5:
            return self.instruction_set['/'].format(one, root, **self._kwargs)
-        elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
-            return self.instruction_set['/'].format(one,
-                                                    self._print(sp.Mul(*[expr.base] * (-expr.exp), evaluate=False)),
-                                                    **self._kwargs)
        else:
            raise ValueError("Generic exponential not supported: " + str(expr))
@@ -805,7 +823,10 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        # noinspection PyProtectedMember
        from sympy.core.mul import _keep_coeff
-        result = self._scalarFallback('_print_Mul', expr)
+        if not inside_add:
+            result = self._scalarFallback('_print_Mul', expr)
+        else:
+            result = None
        if result:
            return result
@@ -880,12 +901,9 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
        result = self._print(expr.args[-1][0])
        for true_expr, condition in reversed(expr.args[:-1]):
-            if isinstance(condition, cast_func) and get_type_of_expression(condition.args[0]) == create_type("bool"):
+            if isinstance(condition, CastFunc) and get_type_of_expression(condition.args[0]) == create_type("bool"):
-                if not KERNCRAFT_NO_TERNARY_MODE:
+                result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
-                    result = "(({}) ? ({}) : ({}))".format(self._print(condition.args[0]), self._print(true_expr),
+                                                       result, **self._kwargs)
-                                                           result, **self._kwargs)
-                else:
-                    print("Warning - skipping ternary op")
            else:
                # noinspection SpellCheckingInspection
                result = self.instruction_set['blendv'].format(result, self._print(true_expr), self._print(condition),

--- a/pystencils/backends/cuda_backend.py
+++ b/pystencils/backends/cuda_backend.py
-from os.path import dirname, join
 from pystencils.astnodes import Node
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, generate_c
+from pystencils.enums import Backend
 from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
-from pystencils.interpolation_astnodes import DiffInterpolatorAccess, InterpolationMode
-with open(join(dirname(__file__), 'cuda_known_functions.txt')) as f:
-    lines = f.readlines()
-    CUDA_KNOWN_FUNCTIONS = {l.strip(): l.strip() for l in lines if l}
 def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=None, with_globals=True) -> str:
@@ -22,7 +16,7 @@ def generate_cuda(ast_node: Node, signature_only: bool = False, custom_backend=N
    Returns:
        CUDA code for the ast node and its descendants
    """
-    return generate_c(ast_node, signature_only, dialect='cuda',
+    return generate_c(ast_node, signature_only, dialect=Backend.CUDA,
                      custom_backend=custom_backend, with_globals=with_globals)
@@ -33,7 +27,7 @@ class CudaBackend(CBackend):
        if not sympy_printer:
            sympy_printer = CudaSympyPrinter()
-        super().__init__(sympy_printer, signature_only, dialect='cuda')
+        super().__init__(sympy_printer, signature_only, dialect=Backend.CUDA)
    def _print_SharedMemoryAllocation(self, node):
        dtype = node.symbol.dtype
@@ -43,26 +37,13 @@ class CudaBackend(CBackend):
        return code
    @staticmethod
-    def _print_ThreadBlockSynchronization(node):
+    def _print_ThreadBlockSynchronization(_):
-        code = "__synchtreads();"
+        return "__synchtreads();"
-        return code
    def _print_TextureDeclaration(self, node):
+        cond = node.texture.field.dtype.numpy_dtype.itemsize > 4
-        # TODO: use fStrings here
+        return f'texture<{"fp_tex_" if cond else ""}{str(node.texture.field.dtype)}, ' \
-        if node.texture.field.dtype.numpy_dtype.itemsize > 4:
+               f'cudaTextureType{node.texture.field.spacial_dimensions}D, cudaReadModeElementType> {node.texture};'
-            code = "texture<fp_tex_%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
-                str(node.texture.field.dtype),
-                node.texture.field.spatial_dimensions,
-                node.texture
-            )
-        else:
-            code = "texture<%s, cudaTextureType%iD, cudaReadModeElementType> %s;" % (
-                str(node.texture.field.dtype),
-                node.texture.field.spatial_dimensions,
-                node.texture
-            )
-        return code
    def _print_SkipIteration(self, _):
        return "return;"
@@ -73,31 +54,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
    def __init__(self):
        super(CudaSympyPrinter, self).__init__()
-        self.known_functions.update(CUDA_KNOWN_FUNCTIONS)
-    def _print_InterpolatorAccess(self, node):
-        dtype = node.interpolator.field.dtype.numpy_dtype
-        if type(node) == DiffInterpolatorAccess:
-            # cubicTex3D_1st_derivative_x(texture tex, float3 coord)
-            template = f"cubicTex%iD_1st_derivative_{list(reversed('xyz'[:node.ndim]))[node.diff_coordinate_idx]}(%s, %s)"  # noqa
-        elif node.interpolator.interpolation_mode == InterpolationMode.CUBIC_SPLINE:
-            template = "cubicTex%iDSimple(%s, %s)"
-        else:
-            if dtype.itemsize > 4:
-                # Use PyCuda hack!
-                # https://github.com/inducer/pycuda/blob/master/pycuda/cuda/pycuda-helpers.hpp
-                template = "fp_tex%iD(%s, %s)"
-            else:
-                template = "tex%iD(%s, %s)"
-        code = template % (
-            node.interpolator.field.spatial_dimensions,
-            str(node.interpolator),
-            # + 0.5 comes from Nvidia's staggered indexing
-            ', '.join(self._print(o + 0.5) for o in reversed(node.offsets))
-        )
-        return code
    def _print_Function(self, expr):
        if isinstance(expr, fast_division):

--- a/pystencils/backends/dot.py
+++ b/pystencils/backends/dot.py
 import graphviz
-from graphviz import Digraph, lang
+try:
+    from graphviz import Digraph
+    import graphviz.quoting as quote
+except ImportError:
+    from graphviz import Digraph
+    import graphviz.lang as quote
 from sympy.printing.printer import Printer
@@ -12,7 +17,7 @@ class DotPrinter(Printer):
        super(DotPrinter, self).__init__()
        self._node_to_str_function = node_to_str_function
        self.dot = Digraph(**kwargs)
-        self.dot.quote_edge = lang.quote
+        self.dot.quote_edge = quote.quote
    def _print_KernelFunction(self, func):
        self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))

--- a/pystencils/backends/json.py
+++ b/pystencils/backends/json.py
--- a/pystencils/backends/ppc_instruction_sets.py
+++ b/pystencils/backends/ppc_instruction_sets.py
--- a/pystencils/backends/riscv_instruction_sets.py
+++ b/pystencils/backends/riscv_instruction_sets.py
+from pystencils.typing import CFunction
 def get_argument_string(function_shortcut, last=''):
    args = function_shortcut[function_shortcut.index('[') + 1: -1]
    arg_string = "("
@@ -30,14 +33,11 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
        'sqrt': 'fsqrt_v[0]',
        'loadU': f'le{bits[data_type]}_v[0]',
-        'loadA': f'le{bits[data_type]}_v[0]',
        'storeU': f'se{bits[data_type]}_v[0, 1]',
-        'storeA': f'se{bits[data_type]}_v[0, 1]',
        'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
-        'maskStoreA': f'se{bits[data_type]}_v[2, 0, 1]',
        'loadS': f'lse{bits[data_type]}_v[0, 1]',
        'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
-        'maskStoreS': f'sse{bits[data_type]}_v[2, 0, 3, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
        'abs': 'fabs_v[0]',
        '==': 'mfeq_vv[0, 1]',
@@ -50,8 +50,8 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
        '|': 'mor_mm[0, 1]',
        'blendv': 'merge_vvm[2, 0, 1]',
-        'any': 'popc_m[0]',
+        'any': 'cpop_m[0]',
-        'all': 'popc_m[0]',
+        'all': 'cpop_m[0]',
    }
    result = dict()
@@ -81,7 +81,6 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
        result[intrinsic_id] = prefix + name + suffix2 + arg_string
-    from pystencils.backends.cbackend import CFunction
    result['width'] = CFunction(width, "int")
    result['intwidth'] = CFunction(intwidth, "int")
@@ -92,7 +91,7 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
    result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
    result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
-    result['maskStoreS'] = result['maskStoreS'].replace('{3}', f'{{3}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
    result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
@@ -101,9 +100,12 @@ def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
    result['int'] = f'vint{bits["int"]}m1_t'
    result['bool'] = f'vbool{bits[data_type]}_t'
-    result['headers'] = ['<riscv_vector.h>']
+    result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
    result['any'] += ' > 0x0'
    result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
    return result
--- a/pystencils/backends/simd_instruction_sets.py
+++ b/pystencils/backends/simd_instruction_sets.py
-import math
 import os
 import platform
-from ctypes import CDLL
+from ctypes import CDLL, c_int, c_size_t, sizeof, byref
+from warnings import warn
+import numpy as np
 from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
 from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
 from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
 from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
+from pystencils.typing import numpy_name_to_c
 def get_vector_instruction_set(data_type='double', instruction_set='avx'):
-    if instruction_set in ['neon'] or instruction_set.startswith('sve'):
+    if data_type == 'float':
-        return get_vector_instruction_set_arm(data_type, instruction_set)
+        warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
+             f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
+             f"releases")
+        data_type = 'float64'
+    type_name = numpy_name_to_c(np.dtype(data_type).name)
+    if instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve'):
+        return get_vector_instruction_set_arm(type_name, instruction_set)
    elif instruction_set in ['vsx']:
-        return get_vector_instruction_set_ppc(data_type, instruction_set)
+        return get_vector_instruction_set_ppc(type_name, instruction_set)
    elif instruction_set in ['rvv']:
-        return get_vector_instruction_set_riscv(data_type, instruction_set)
+        return get_vector_instruction_set_riscv(type_name, instruction_set)
    else:
-        return get_vector_instruction_set_x86(data_type, instruction_set)
+        return get_vector_instruction_set_x86(type_name, instruction_set)
-_cache = None
-_cachelinesize = None
+@memorycache
 def get_supported_instruction_sets():
    """List of supported instruction sets on current hardware, or None if query failed."""
-    global _cache
-    if _cache is not None:
-        return _cache.copy()
    if 'PYSTENCILS_SIMD' in os.environ:
        return os.environ['PYSTENCILS_SIMD'].split(',')
-    if platform.system() == 'Darwin' and platform.machine() == 'arm64':  # not supported by cpuinfo
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
+        result = ['neon']
+        libc = CDLL('/usr/lib/libc.dylib')
+        value = c_int(0)
+        size = c_size_t(sizeof(value))
+        status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
+        if status == 0 and value.value == 1:
+            result.insert(0, "sme")
+        return result
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
        return ['neon']
-    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):  # not supported by cpuinfo
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+        if hwcap & (1 << 22):  # HWCAP_SVE
+            if hwcap2 & (1 << 1):  # HWCAP2_SVE2
+                name = 'sve2'
+            else:
+                name = 'sve'
+            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
+            if length < 0:
+                raise OSError("SVE length query failed")
+            while length >= 128:
+                result.append(f"{name}{length}")
+                length //= 2
+            result.append(name)
+        if hwcap2 & (1 << 23):  # HWCAP2_SME
+            result.insert(0, "sme")  # prepend to list so it is not automatically chosen as best instruction set
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
        libc = CDLL('libc.so.6')
        hwcap = libc.getauxval(16)  # AT_HWCAP
        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
        return ['rvv'] if hwcap & hwcap_isa_v else []
-    elif platform.machine().startswith('ppc64'):  # no flags reported by cpuinfo
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
-        import subprocess
+        libc = CDLL('libc.so.6')
-        import tempfile
+        hwcap = libc.getauxval(16)  # AT_HWCAP
-        from pystencils.cpu.cpujit import get_compiler_config
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
-        f = tempfile.NamedTemporaryFile(suffix='.cpp')
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
-        command = [get_compiler_config()['command'], '-mcpu=native', '-dM', '-E', f.name]
+        try:
-        macros = subprocess.check_output(command, input='', text=True)
+            from cpuinfo import get_cpu_info
-        if '#define __VSX__' in macros and '#define __ALTIVEC__' in macros:
+        except ImportError:
-            _cache = ['vsx']
+            return None
-        else:
-            _cache = []
-        return _cache.copy()
-    try:
-        from cpuinfo import get_cpu_info
-    except ImportError:
-        return None
-    result = []
+        result = []
-    required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
-    required_avx_flags = {'avx', 'avx2'}
+        required_avx_flags = {'avx', 'avx2'}
-    required_avx512_flags = {'avx512f'}
+        required_avx512_flags = {'avx512f'}
-    required_neon_flags = {'neon'}
+        possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
-    required_sve_flags = {'sve'}
+        flags = set(get_cpu_info()['flags'])
-    flags = set(get_cpu_info()['flags'])
+        if flags.issuperset(required_sse_flags):
-    if flags.issuperset(required_sse_flags):
+            result.append("sse")
-        result.append("sse")
+        if flags.issuperset(required_avx_flags):
-    if flags.issuperset(required_avx_flags):
+            result.append("avx")
-        result.append("avx")
+        if flags.issuperset(required_avx512_flags):
-    if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
-        result.append("avx512")
+        if not flags.isdisjoint(possible_avx512vl_flags):
-    if flags.issuperset(required_neon_flags):
+            result.append("avx512vl")
-        result.append("neon")
+        return result
-    if flags.issuperset(required_sve_flags):
+    else:
-        if platform.system() == 'Linux':
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
-            libc = CDLL('libc.so.6')
+                                  (platform.system(), platform.machine()))
-            native_length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
-            if native_length < 0:
-                raise OSError("SVE length query failed")
-            pwr2_length = int(2**math.floor(math.log2(native_length)))
-            if pwr2_length % 256 == 0:
-                result.append(f"sve{pwr2_length//2}")
-            if native_length != pwr2_length:
-                result.append(f"sve{pwr2_length}")
-            result.append(f"sve{native_length}")
-        result.append("sve")
-    return result
+@memorycache
 def get_cacheline_size(instruction_set):
    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
       Usually, this is identical to the cache line size."""
-    global _cachelinesize
    instruction_sets = get_vector_instruction_set('double', instruction_set)
    if 'cachelineSize' not in instruction_sets:
        return None
-    if _cachelinesize is not None:
-        return _cachelinesize
    import pystencils as ps
+    from pystencils.astnodes import SympyAssignment
    import numpy as np
    from pystencils.cpu.vectorization import CachelineSize
    arr = np.zeros((1, 1), dtype=np.float32)
    f = ps.Field.create_from_numpy_array('f', arr, index_dimensions=0)
-    ass = [CachelineSize(), ps.Assignment(f.center, CachelineSize.symbol)]
+    ass = [CachelineSize(), SympyAssignment(f.center, CachelineSize.symbol)]
    ast = ps.create_kernel(ass, cpu_vectorize_info={'instruction_set': instruction_set})
    kernel = ast.compile()
    kernel(**{f.name: arr, CachelineSize.symbol.name: 0})
-    _cachelinesize = int(arr[0, 0])
+    return int(arr[0, 0])
-    return _cachelinesize
--- a/pystencils/backends/x86_instruction_sets.py
+++ b/pystencils/backends/x86_instruction_sets.py
@@ -51,14 +51,14 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
        'makeVecConstBool': 'set[]',
        'makeVecInt': 'set[]',
        'makeVecConstInt': 'set[]',
        'loadU': 'loadu[0]',
        'loadA': 'load[0]',
        'storeU': 'storeu[0,1]',
        'storeA': 'store[0,1]',
        'stream': 'stream[0,1]',
-        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
+        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
-        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set == 'avx512' else 'maskstore[0, 2, 1]',
+        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
    }
    for comparison_op, constant in comparisons.items():
@@ -66,6 +66,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    headers = {
        'avx512': ['<immintrin.h>'],
+        'avx512vl': ['<immintrin.h>'],
        'avx': ['<immintrin.h>'],
        'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
                '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
@@ -79,6 +80,7 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    prefix = {
        'sse': '_mm',
        'avx': '_mm256',
+        'avx512vl': '_mm256',
        'avx512': '_mm512',
    }
@@ -89,11 +91,13 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
        ("double", "avx"): 4,
        ("float", "avx"): 8,
        ("int", "avx"): 8,
+        ("double", "avx512vl"): 4,
+        ("float", "avx512vl"): 8,
+        ("int", "avx512vl"): 8,
        ("double", "avx512"): 8,
        ("float", "avx512"): 16,
        ("int", "avx512"): 16,
    }
    result = {
        'width': width[(data_type, instruction_set)],
        'intwidth': width[('int', instruction_set)],
@@ -111,14 +115,9 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
            suf = suffix[data_type]
            arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
-        mask_suffix = '_mask' if instruction_set == 'avx512' and intrinsic_id in comparisons.keys() else ''
+        mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
-    result['dataTypePrefix'] = {
-        'double': "_" + pre + 'd',
-        'float': "_" + pre,
-    }
    bit_width = result['width'] * (64 if data_type == 'double' else 32)
    result['double'] = f"__m{bit_width}d"
    result['float'] = f"__m{bit_width}"
@@ -129,29 +128,45 @@ def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
    result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
    result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
-    if instruction_set == 'avx512':
+    setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
+    if instruction_set.startswith('avx512'):
        size = result['width']
-        result['&'] = f'_kand_mask{size}({{0}}, {{1}})'
+        masksize = max(size, 8)
-        result['|'] = f'_kor_mask{size}({{0}}, {{1}})'
+        result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
-        result['any'] = f'!_ktestz_mask{size}_u8({{0}}, {{0}})'
+        result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
-        result['all'] = f'_kortestc_mask{size}_u8({{0}}, {{0}})'
+        result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
+        result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
        result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
        result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
-        result['abs'] = f"{pre}_abs_{suf}({{0}})"
+        result['bool'] = f"__mmask{masksize}"
-        result['bool'] = f"__mmask{size}"
        params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
        result['makeVecBool'] = f"__mmask8(({params}) )"
        params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
        result['makeVecConstBool'] = f"__mmask8(({params}) )"
-        vindex = f'{pre}_set_epi{bit_width//size}(' + ', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
+        vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
-        vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}({{0}}))'
+                 ', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
+        vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
+        scale = bit_width // size // 8
        result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
-                           f', {{1}}, {64//size})'
+                           f', {{1}}, {scale})'
        result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
-                               f', {{1}}, {64//size})'
+                               f', {{1}}, {scale})'
-        result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {64//size})'
+        if bit_width == 512:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
+        else:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
+    # abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
+    if instruction_set == 'avx512':
+        result['abs'] = f"{pre}_abs_{suf}({{0}})"
+    else:
+        result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
+                        f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
+                        'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
+                        f"{pre}_cast{suf}_si{bit_width}({{0}})))"
    if instruction_set == 'avx' and data_type == 'float':
        result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"

--- a/src/pystencils/bit_masks.py
+++ b/src/pystencils/bit_masks.py
+import sympy as sp
+# from pystencils.typing import get_type_of_expression
+# noinspection PyPep8Naming
+class flag_cond(sp.Function):
+    """Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
+    depending on whether the flag is set. 
+    Three argument version:
+    ```
+        flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
+    ```
+    Four argument version:
+    ```
+        flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
+    ```
+    """
+    nargs = (3, 4)
+    def __new__(cls, flag_bit, mask_expression, *expressions):
+        # TODO Jan reintroduce checking
+        # flag_dtype = get_type_of_expression(flag_bit)
+        # if not flag_dtype.is_int():
+        #     raise ValueError('Argument flag_bit must be of integer type.')
+        #
+        # mask_dtype = get_type_of_expression(mask_expression)
+        # if not mask_dtype.is_int():
+        #     raise ValueError('Argument mask_expression must be of integer type.')
+        return super().__new__(cls, flag_bit, mask_expression, *expressions)
+    def to_c(self, print_func):
+        flag_bit = self.args[0]
+        mask = self.args[1]
+        then_expression = self.args[2]
+        flag_bit_code = print_func(flag_bit)
+        mask_code = print_func(mask)
+        then_code = print_func(then_expression)
+        code = f"(({mask_code}) >> ({flag_bit_code}) & 1) * ({then_code})"
+        if len(self.args) > 3:
+            else_expression = self.args[3]
+            else_code = print_func(else_expression)
+            code += f" + (({mask_code}) >> ({flag_bit_code}) ^ 1) * ({else_code})"
+        return code
--- a/pystencils/boundaries/__init__.py
+++ b/pystencils/boundaries/__init__.py
--- a/pystencils/boundaries/boundaryconditions.py
+++ b/pystencils/boundaries/boundaryconditions.py
 from typing import Any, List, Tuple
-from pystencils import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
-from pystencils.data_types import create_type
+from pystencils.typing import create_type
 class Boundary:
@@ -14,7 +14,7 @@ class Boundary:
    def __init__(self, name=None):
        self._name = name
-    def __call__(self, field, direction_symbol, index_field) -> List[Assignment]:
+    def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
        """Defines the boundary behavior and must therefore be implemented by all boundaries.
        Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
@@ -63,20 +63,20 @@ class Neumann(Boundary):
        neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
        if field.index_dimensions == 0:
-            return [Assignment(field.center, field[neighbor])]
+            return [SympyAssignment(field.center, field[neighbor])]
        else:
            from itertools import product
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
            index_iter = product(*(range(i) for i in field.index_shape))
-            return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
+            return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
    def __hash__(self):
        # All boundaries of these class behave equal -> should also be equal
        return hash("Neumann")
    def __eq__(self, other):
-        return type(other) == Neumann
+        return type(other) is Neumann
 class Dirichlet(Boundary):
@@ -103,11 +103,11 @@ class Dirichlet(Boundary):
    def __call__(self, field, direction_symbol, index_field, **kwargs):
        if field.index_dimensions == 0:
-            return [Assignment(field.center, index_field("value") if self.additional_data else self._value)]
+            return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
        elif field.index_dimensions == 1:
            assert not self.additional_data
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Field needs fixed index shape")
            assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
-            return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
+            return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
        raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
+from functools import lru_cache
 import numpy as np
 import sympy as sp
-from pystencils import create_indexed_kernel
+from pystencils import create_kernel, CreateKernelConfig, Target
-from pystencils.assignment import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CustomCodeNode
 from pystencils.boundaries.createindexlist import (
    create_boundary_index_array, numpy_data_type_for_boundary_object)
-from pystencils.cache import memorycache
+from pystencils.typing import TypedSymbol, create_type
-from pystencils.data_types import TypedSymbol, create_type
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
-from pystencils.datahandling.pycuda import PyCudaArrayHandler
 from pystencils.field import Field
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 try:
    # noinspection PyPep8Naming
    import waLBerla as wlb
    if wlb.cpp_available:
        from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
    else:
        ParallelDataHandling = None
 except ImportError:
@@ -33,11 +35,11 @@ class FlagInterface:
        >>> dh = create_data_handling((4, 5))
        >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
        >>> assert dh.has_data('flag_field')
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        2
-        >>> fi.reserve_flag(4)
+        >>> int(fi.reserve_flag(4))
        4
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        8
    """
@@ -84,7 +86,7 @@ class FlagInterface:
 class BoundaryHandling:
    def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
-                 target='cpu', openmp=True):
+                 target: Target = Target.CPU, openmp=True):
        assert data_handling.has_data(field_name)
        assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
        self._data_handling = data_handling
@@ -99,7 +101,7 @@ class BoundaryHandling:
        self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
        if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = PyCudaArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
        else:
            array_handler = self.data_handling.array_handler
@@ -115,7 +117,8 @@ class BoundaryHandling:
            for obj, cpu_arr in cpu_version.items():
                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = array_handler.to_gpu(cpu_arr)
+                    gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
+                    array_handler.upload(gpu_version[obj], cpu_arr)
                else:
                    array_handler.upload(gpu_version[obj], cpu_arr)
@@ -378,15 +381,15 @@ class BoundaryDataSetter:
        assert coord < self.dim
        return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
-    @memorycache()
+    @lru_cache()
    def link_offsets(self):
        return self.stencil[self.index_array['dir']]
-    @memorycache()
+    @lru_cache()
    def link_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
-    @memorycache()
+    @lru_cache()
    def boundary_cell_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
        code = "\n"
        for i in range(dim):
            offset_str = ", ".join([str(d[i]) for d in stencil])
-            code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
+            code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
        inv_dirs = []
        for direction in stencil:
            inverse_dir = tuple([-i for i in direction])
            inv_dirs.append(str(stencil.index(inverse_dir)))
-        code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
+        code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
        offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
        super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
                                                 symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
    @staticmethod
    def _offset_symbols(dim):
-        return [TypedSymbol(f"c{d}", create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]]
+        return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
-    INV_DIR_SYMBOL = TypedSymbol("invdir", "int")
+    INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
-def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', **kernel_creation_args):
+def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
    elements = [BoundaryOffsetInfo(stencil)]
-    index_arr_dtype = index_field.dtype.numpy_dtype
+    dir_symbol = TypedSymbol("dir", np.int32)
-    dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0])
+    elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
-    elements += [Assignment(dir_symbol, index_field[0]('dir'))]
    elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
-    return create_indexed_kernel(elements, [index_field], target=target, **kernel_creation_args)
+    config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
+                                **kernel_creation_args)
+    return create_kernel(elements, config=config)
--- a/pystencils/boundaries/createindexlist.py
+++ b/pystencils/boundaries/createindexlist.py
-import itertools
 import warnings
 import numpy as np
 try:
-    # Try to import right away - assume compiled code is available
+    import pyximport
-    # compile with: python setup.py build_ext --inplace --use-cython
-    from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-        create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
+    pyximport.install(language_level=3)
    cython_funcs_available = True
 except ImportError:
-    try:
+    cython_funcs_available = False
-        # If not, try development mode and import via pyximport
-        import pyximport
+if cython_funcs_available:
+    from pystencils.boundaries.createindexlistcython import (
-        pyximport.install(language_level=3)
+        create_boundary_neighbor_index_list_2d,
-        cython_funcs_available = True
+        create_boundary_neighbor_index_list_3d,
-    except ImportError:
+        create_boundary_cell_index_list_2d,
-        cython_funcs_available = False
+        create_boundary_cell_index_list_3d,
-    if cython_funcs_available:
+    )
-        from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-            create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, \
-            create_boundary_cell_index_list_3d
 boundary_index_array_coordinate_names = ["x", "y", "z"]
 direction_member_name = "dir"
+default_index_array_dtype = np.int32
 def numpy_data_type_for_boundary_object(boundary_object, dim):
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    return np.dtype([(name, np.int32) for name in coordinate_names]
+    return np.dtype(
-                    + [(direction_member_name, np.int32)]
+        [(name, default_index_array_dtype) for name in coordinate_names]
-                    + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True)
+        + [(direction_member_name, default_index_array_dtype)]
+        + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
+        align=True,
-def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask,
+    )
-                                                fluid_mask, stencil, single_link):
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+def _create_index_list_python(
+    flag_field_arr,
-    result = []
+    boundary_mask,
-    gl = nr_of_ghost_layers
+    fluid_mask,
-    for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])):
+    stencil,
-        cell = cell[::-1]
+    single_link,
-        if not flag_field_arr[cell] & fluid_mask:
+    inner_or_boundary=False,
-            continue
+    nr_of_ghost_layers=None,
+):
+    if inner_or_boundary and nr_of_ghost_layers is None:
+        raise ValueError(
+            "If inner_or_boundary is set True the number of ghost layers "
+            "around the inner domain has to be specified"
+        )
+    if nr_of_ghost_layers is None:
+        nr_of_ghost_layers = 0
+    coordinate_names = boundary_index_array_coordinate_names[
+        : len(flag_field_arr.shape)
+    ]
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+    )
+    # boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
+    # have to be sorted.
+    boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
+    for i in range(len(flag_field_arr.shape)):
+        boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
+    # First a set is created to save all fluid cells which are near boundary
+    fluid_cells = set()
+    for cell in boundary_cells:
+        cell = tuple(cell)
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            if flag_field_arr[neighbor_cell] & boundary_mask:
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
-                result.append(cell + (dir_idx,))
+            )
-                if single_link:
+            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
-                    break
+            if any(
+                not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
-    return np.array(result, dtype=index_arr_dtype)
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
+                continue
+            if flag_field_arr[neighbor_cell] & fluid_mask:
+                fluid_cells.add(neighbor_cell)
+    # then this is set is transformed to a list to make it sortable. This ensures continoous memory access later.
+    fluid_cells = list(fluid_cells)
+    if len(flag_field_arr.shape) == 3:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
+    else:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
-def _create_boundary_cell_index_list_python(flag_field_arr, boundary_mask,
+    cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
-                                            fluid_mask, stencil, single_link):
+    checkmask = boundary_mask if inner_or_boundary else fluid_mask
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
    result = []
-    for cell in itertools.product(*reversed([range(0, i) for i in flag_field_arr.shape])):
+    for cell in cells_to_iterate:
-        cell = cell[::-1]
+        cell = tuple(cell)
-        if not flag_field_arr[cell] & boundary_mask:
+        sum_cells = np.zeros(len(cell))
-            continue
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field_arr.shape)):
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
+            )
+            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
+            if any(
+                not 0 <= e < upper
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
                continue
-            if flag_field_arr[neighbor_cell] & fluid_mask:
+            if flag_field_arr[neighbor_cell] & checkmask:
-                result.append(cell + (dir_idx,))
                if single_link:
-                    break
+                    sum_cells += np.array(direction)
+                else:
+                    result.append(tuple(cell) + (dir_idx,))
+        # the discrete normal direction is the one which gives the maximum inner product to the stencil direction
+        if single_link and any(sum_cells != 0):
+            idx = np.argmax(np.inner(sum_cells, stencil))
+            result.append(tuple(cell) + (idx,))
    return np.array(result, dtype=index_arr_dtype)
-def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
+def create_boundary_index_list(
-                               nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
    """Creates a numpy array storing links (connections) between domain cells and boundary cells.
    Args:
@@ -91,15 +141,25 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        nr_of_ghost_layers: only relevant if neighbors is True
        inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
                    if false the boundary cells are listed
-        single_link: if true only the first link is reported from this cell
+        single_link: if true only the link in normal direction to this cell is reported
    """
    dim = len(flag_field.shape)
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
-    stencil = np.array(stencil, dtype=np.int32)
+        + [(direction_member_name, default_index_array_dtype)]
-    args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link)
+    )
+    stencil = np.array(stencil, dtype=default_index_array_dtype)
+    args = (
+        flag_field,
+        nr_of_ghost_layers,
+        boundary_mask,
+        fluid_mask,
+        stencil,
+        single_link,
+    )
    args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
    if cython_funcs_available:
@@ -118,24 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        return np.array(idx_list, dtype=index_arr_dtype)
    else:
        if flag_field.size > 1e6:
-            warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up")
+            warnings.warn(
-        if inner_or_boundary:
+                "Boundary setup may take very long! Consider installing cython to speed it up"
-            return _create_boundary_neighbor_index_list_python(*args)
+            )
-        else:
+        return _create_index_list_python(
-            return _create_boundary_cell_index_list_python(*args_no_gl)
+            *args_no_gl,
+            inner_or_boundary=inner_or_boundary,
+            nr_of_ghost_layers=nr_of_ghost_layers,
-def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object,
+        )
-                                nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
-    idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
-                                           nr_of_ghost_layers, inner_or_boundary, single_link)
+def create_boundary_index_array(
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    boundary_object,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
+    idx_array = create_boundary_index_list(
+        flag_field,
+        stencil,
+        boundary_mask,
+        fluid_mask,
+        nr_of_ghost_layers,
+        inner_or_boundary,
+        single_link,
+    )
    dim = len(flag_field.shape)
    if boundary_object.additional_data:
        coordinate_names = boundary_index_array_coordinate_names[:dim]
        index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
        extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
-        for prop in coordinate_names + ['dir']:
+        for prop in coordinate_names + ["dir"]:
            extended_idx_field[prop] = idx_array[prop]
        idx_array = extended_idx_field

--- a/pystencils/boundaries/createindexlistcython.pyx
+++ b/pystencils/boundaries/createindexlistcython.pyx
-# distutils: language=c
+# cython: language_level=3str
-# Workaround for cython bug
-# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
-WORKAROUND = "Something"
 import cython
@@ -22,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
        for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & fluid_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if flag_field[x + dx, y + dy] & boundary_mask:
-                        boundary_index_list.append((x,y, dirIdx))
                        if single_link:
-                            break
+                            sum_x += dx; sum_y += dy;
+                        else:
+                            boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
@@ -47,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -54,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
        for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
            for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & fluid_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
-                            boundary_index_list.append((x,y,z, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy; sum_z += dz
+                            else:
+                                boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx = 0
+                if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx * sum_x + dy * sum_y + dz * sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
@@ -75,21 +105,39 @@ def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(0, ys):
        for x in range(0, xs):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & boundary_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if 0 <= x + dx < xs and 0 <= y + dy < ys:
                        if flag_field[x + dx, y + dy] & fluid_mask:
-                            boundary_index_list.append((x,y, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy
+                            else:
+                                boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
@@ -101,6 +149,10 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -108,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    for z in range(0, zs):
        for y in range(0, ys):
            for x in range(0, xs):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & boundary_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
                            if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
-                                boundary_index_list.append((x,y,z, dirIdx))
                                if single_link:
-                                    break
+                                    sum_x += dx; sum_y += dy; sum_z += dz
+                                else:
+                                    boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx=0
+                if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx*sum_x + dy*sum_y + dz*sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
\ No newline at end of file
--- a/pystencils/boundaries/inkernel.py
+++ b/pystencils/boundaries/inkernel.py
 import sympy as sp
 from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
-from pystencils.data_types import TypedSymbol, create_type
+from pystencils.typing import TypedSymbol, create_type
 from pystencils.field import Field
 from pystencils.integer_functions import bitwise_and

--- a/src/pystencils/cache.py
+++ b/src/pystencils/cache.py
+import os
+from collections.abc import Hashable
+from functools import partial, wraps
+from itertools import chain
+from functools import lru_cache as memorycache
+from joblib import Memory
+from appdirs import user_cache_dir
+if 'PYSTENCILS_CACHE_DIR' in os.environ:
+    cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
+else:
+    cache_dir = user_cache_dir('pystencils')
+disk_cache = Memory(cache_dir, verbose=False).cache
+disk_cache_no_fallback = disk_cache
+def _wrapper(wrapped_func, cached_func, *args, **kwargs):
+    if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
+        return cached_func(*args, **kwargs)
+    else:
+        return wrapped_func(*args, **kwargs)
+def memorycache_if_hashable(maxsize=128, typed=False):
+    def wrapper(func):
+        return partial(_wrapper, func, memorycache(maxsize, typed)(func))
+    return wrapper
+def sharedmethodcache(cache_id: str):
+    """Decorator for memoization of instance methods, allowing multiple methods to use the same cache.
+    This decorator caches results of instance methods per instantiated object of the surrounding class.
+    It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
+    Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the 
+    `self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
+    Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
+    and must return the same result when called with the same arguments."""
+    def _decorator(user_method):
+        @wraps(user_method)
+        def _decorated_func(self, *args, **kwargs):
+            objdict = self.__dict__
+            cache = objdict.setdefault(cache_id, dict())
+            key = args
+            for item in kwargs.items():
+                key += item
+            if key not in cache:
+                result = user_method(self, *args, **kwargs)
+                cache[key] = result
+                return result
+            else:
+                return cache[key]
+        return _decorated_func
+    return _decorator
+def clear_cache():
+    """
+    Clears the pystencils cache created by joblib.
+    """
+    memory = Memory(cache_dir, verbose=0)
+    memory.clear(warn=False)
+# Disable memory cache:
+# disk_cache = lambda o: o
+# disk_cache_no_fallback = lambda o: o
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
+from copy import copy
+from collections import defaultdict
+from dataclasses import dataclass, field
+from types import MappingProxyType
+from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
+from pystencils import Target, Backend, Field
+from pystencils.typing.typed_sympy import BasicType
+from pystencils.typing.utilities import collate_types
+import numpy as np
+# TODO: There exists DTypeLike in NumPy which would be better than type for type hinting, to new at the moment
+# from numpy.typing import DTypeLike
+# TODO: CreateKernelConfig is bloated think of more classes better usage, factory whatever ...
+# Proposition: CreateKernelConfigs Classes for different targets?
+@dataclass
+class CreateKernelConfig:
+    """
+    **Below all parameters for the CreateKernelConfig are explained**
+    """
+    target: Target = Target.CPU
+    """
+    All targets are defined in :class:`pystencils.enums.Target`
+    """
+    backend: Backend = None
+    """
+    All backends are defined in :class:`pystencils.enums.Backend`
+    """
+    function_name: str = 'kernel'
+    """
+    Name of the generated function - only important if generated code is written out
+    """
+    data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
+    """
+    Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type.
+    If specified as a dict ideally a defaultdict is used to define a default value for symbols not listed in the
+    dict. If a plain dict is provided it will be transformed into a defaultdict internally. The default value 
+    will then be specified via type collation then.
+    """
+    default_number_float: Union[type, str, BasicType] = None
+    """
+    Data type used for all untyped floating point numbers (i.e. 0.5). By default the value of data_type is used.
+    If data_type is given as a defaultdict its default_factory is used.
+    """
+    default_number_int: Union[type, str, BasicType] = np.int64
+    """
+    Data type used for all untyped integer numbers (i.e. 1)
+    """
+    iteration_slice: Tuple = None
+    """
+    Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
+    """
+    ghost_layers: Union[bool, int, List[Tuple[int]]] = None
+    """
+    A single integer specifies the ghost layer count at all borders, can also be a sequence of
+    pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
+    If left to default, the number of ghost layers is determined automatically from the assignments.
+    """
+    cpu_openmp: Union[bool, int] = False
+    """
+    `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
+    of available threads will be chosen.
+    """
+    cpu_vectorize_info: Dict = None
+    """
+    A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
+    for documentation of these parameters see vectorize function. Example:
+    '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
+    """
+    cpu_blocking: Tuple[int] = None
+    """
+    A tuple of block sizes or `None` if no blocking should be applied
+    """
+    omp_single_loop: bool = True
+    """
+    If OpenMP is active: whether multiple outer loops are permitted
+    """
+    base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
+    """
+    Specification of how many and which intermediate pointers are created for a field access.
+    For example [ (0), (2,3,)]  creates on base pointer for coordinates 2 and 3 and writes the offset for coordinate
+    zero directly in the field access. These specifications are defined dependent on the loop ordering.
+    This function translates more readable version into the specification above.
+    For more information see: `pystencils.transformations.create_intermediate_base_pointer`
+    """
+    gpu_indexing: str = 'block'
+    """
+    Either 'block' or 'line' , or custom indexing class, see `pystencils.gpu.AbstractIndexing`
+    """
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
+    """
+    Dict with indexing parameters (constructor parameters of indexing class)
+    e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
+    """
+    # TODO Markus rework this docstring
+    default_assignment_simplifications: bool = False
+    """
+    If `True` default simplifications are first performed on the Assignments. If problems occur during the
+    simplification a warning will be thrown.
+    Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
+    on the level of the `pystencils.AssignmentCollection`.  In this part,
+    `pystencil.simp.create_simplification_strategy` from pystencils.simplificationfactory will be used to
+    apply optimisations like insertion of constants to
+    remove pressure from the registers. Thus the first part of the optimisations can only be executed if
+    an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
+    individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
+    to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
+    """
+    cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
+    """
+    List of extra optimizations to perform first on the AST.
+    """
+    use_auto_for_assignments: bool = False
+    """
+    If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
+    """
+    index_fields: List[Field] = None
+    """
+    List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
+    instead of `create_domain_kernel` is used.
+    """
+    coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
+    """
+    Name of the coordinate fields in the struct data type.
+    """
+    allow_double_writes: bool = False
+    """
+    If True, don't check if every field is only written at a single location. This is required
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
+    cells at once. Use with care!
+    """
+    skip_independence_check: bool = False
+    """
+    By default the assignment list is checked for read/write independence. This means fields are only written at
+    locations where they are read. Doing so guarantees thread safety. In some cases e.g. for
+    periodicity kernel, this can not be assured and does the check needs to be deactivated. Use with care!
+    """
+    class DataTypeFactory:
+        """Because of pickle, we need to have a nested class, instead of a lambda in __post_init__"""
+        def __init__(self, dt):
+            self.dt = dt
+        def __call__(self):
+            return BasicType(self.dt)
+    def _check_type(self, dtype_to_check):
+        if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
+            self._typing_error()
+        if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
+            # NumPy-types are also of type 'type'. However, they have more properties
+            self._typing_error()
+    @staticmethod
+    def _typing_error():
+        raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
+                         "types are ambiguous. For example float will map to double. "
+                         "Also the string version like 'float' is not allowed, e.g. use 'float64' instead")
+    def __post_init__(self):
+        # ----  Legacy parameters
+        if not isinstance(self.target, Target):
+            raise ValueError("target must be provided by the 'Target' enum")
+        # ---- Auto Backend
+        if not self.backend:
+            if self.target == Target.CPU:
+                self.backend = Backend.C
+            elif self.target == Target.GPU:
+                self.backend = Backend.CUDA
+            else:
+                raise NotImplementedError(f'Target {self.target} has no default backend')
+        if not isinstance(self.backend, Backend):
+            raise ValueError("backend must be provided by the 'Backend' enum")
+        # Normalise data types
+        for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
+            self._check_type(dtype)
+        if not isinstance(self.data_type, dict):
+            dt = copy(self.data_type)  # The copy is necessary because BasicType has sympy shinanigans
+            self.data_type = defaultdict(self.DataTypeFactory(dt))
+        if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
+            for dtype in self.data_type.values():
+                self._check_type(dtype)
+            dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
+            dtype_dict = self.data_type
+            self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)
+        assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
+        for dtype in self.data_type.values():
+            self._check_type(dtype)
+        self._check_type(self.data_type.default_factory())
+        if self.default_number_float is None:
+            self.default_number_float = self.data_type.default_factory()
+        if not isinstance(self.default_number_float, BasicType):
+            self.default_number_float = BasicType(self.default_number_float)
+        if not isinstance(self.default_number_int, BasicType):
+            self.default_number_int = BasicType(self.default_number_int)
--- a/pystencils/cpu/__init__.py
+++ b/pystencils/cpu/__init__.py
 from pystencils.cpu.cpujit import make_python_function
-from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel
+from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
-__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'make_python_function']
+__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -13,7 +13,7 @@ in a configuration file.
 3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
   ``%HOMEPATH%\.pystencils\config.json`` (Windows)
-If no configuration file is found, a default configuration is created at the above mentioned location in your home.
+If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
 So run *pystencils* once, then edit the created configuration file.
@@ -23,7 +23,7 @@ Compiler Config (Linux)
 - **'os'**: should be detected automatically as 'linux'
 - **'command'**: path to C++ compiler (defaults to 'g++')
 - **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the restrict qualifier is not standardized accross compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For most Linux compilers the qualifier is ``__restrict__``
@@ -39,31 +39,37 @@ Then 'cl.exe' is used to compile.
  where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
 - **'arch'**: 'x86' or 'x64'
 - **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the restrict qualifier is not standardized across compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For Windows compilers the qualifier should be ``__restrict``
 """
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
 import os
 import platform
 import shutil
 import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from collections import OrderedDict
+import time
-from sysconfig import get_paths
+import warnings
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+import pathlib
 import numpy as np
-from appdirs import user_cache_dir, user_config_dir
 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.backends.cbackend import generate_c, get_headers, CFunction
+from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.data_types import cast_func, VectorType, vector_memory_access
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
-from pystencils.utils import atomic_file_write, file_handle_for_atomic_write, recursive_dict_update
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
+from pystencils.utils import atomic_file_write, recursive_dict_update
 def make_python_function(kernel_function_node, custom_backend=None):
@@ -75,6 +81,7 @@ def make_python_function(kernel_function_node, custom_backend=None):
        - all symbols which are not defined in the kernel itself are expected as parameters
    :param kernel_function_node: the abstract syntax tree
+    :param custom_backend: use own custom printer for code generation
    :return: kernel functor
    """
    result = compile_and_load(kernel_function_node, custom_backend)
@@ -117,15 +124,15 @@ def get_configuration_file_path():
    # 1) Read path from environment variable if found
    if 'PYSTENCILS_CONFIG' in os.environ:
-        return os.environ['PYSTENCILS_CONFIG'], True
+        return os.environ['PYSTENCILS_CONFIG']
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
-        return "pystencils.json", True
+        return "pystencils.json"
    # 3) Try ~/.pystencils.json
    elif os.path.exists(config_path_in_home):
-        return config_path_in_home, True
+        return config_path_in_home
    else:
-        return config_path_in_home, False
+        return config_path_in_home
 def create_folder(path, is_file):
@@ -137,52 +144,50 @@ def create_folder(path, is_file):
        pass
-def get_llc_command():
-    """Try to get executable for llvm's IR compiler llc
-    We try if one of the following is in PATH: llc, llc-10, llc-9, llc-8, llc-7, llc-6
-    """
-    candidates = ['llc', 'llc-10', 'llc-9', 'llc-8', 'llc-7', 'llc-6']
-    found_executables = (e for e in candidates if shutil.which(e))
-    return next(found_executables, None)
 def read_config():
    if platform.system().lower() == 'linux':
        default_compiler_config = OrderedDict([
            ('os', 'linux'),
            ('command', 'g++'),
-            ('llc_command', get_llc_command() or 'llc'),
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
-        if platform.machine().startswith('ppc64'):
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
        default_compiler_config = OrderedDict([
            ('os', 'windows'),
            ('msvc_version', 'latest'),
-            ('llc_command', get_llc_command() or 'llc'),
            ('arch', 'x64'),
            ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
            ('restrict_qualifier', '__restrict')
        ])
+        if platform.machine() == 'ARM64':
+            default_compiler_config['arch'] = 'ARM64'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
    elif platform.system().lower() == 'darwin':
        default_compiler_config = OrderedDict([
            ('os', 'darwin'),
            ('command', 'clang++'),
-            ('llc_command', get_llc_command() or 'llc'),
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
        if platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', '')
+            if 'sme' in get_supported_instruction_sets():
+                flag = '-march=armv8.7-a+sme '
+            else:
+                flag = ''
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
        for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
                       '/opt/homebrew/lib/libomp.dylib']:
            if os.path.exists(libomp):
                default_compiler_config['flags'] += ' ' + libomp
                break
+    else:
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
+                                  (platform.system(),))
    default_cache_config = OrderedDict([
        ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
        ('clear_cache_on_start', False),
@@ -191,42 +196,47 @@ def read_config():
    default_config = OrderedDict([('compiler', default_compiler_config),
                                  ('cache', default_cache_config)])
-    config_path, config_exists = get_configuration_file_path()
+    from fasteners import InterProcessLock
+    config_path = pathlib.Path(get_configuration_file_path())
+    config_path.parent.mkdir(parents=True, exist_ok=True)
    config = default_config.copy()
-    if config_exists:
-        with open(config_path, 'r') as json_config_file:
+    lockfile = config_path.with_suffix(config_path.suffix + ".lock")
-            loaded_config = json.load(json_config_file)
+    with InterProcessLock(lockfile):
-        config = recursive_dict_update(config, loaded_config)
+        if config_path.exists():
-    else:
+            with open(config_path, 'r') as json_config_file:
-        create_folder(config_path, True)
+                loaded_config = json.load(json_config_file)
-        with open(config_path, 'w') as f:
+            config = recursive_dict_update(config, loaded_config)
-            json.dump(config, f, indent=4)
+        else:
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=4)
    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
-        clear_cache = False
+        clear_cache_on_start = False
        cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json')
        if os.path.exists(cache_status_file):
            # check if compiler config has changed
            last_config = json.load(open(cache_status_file, 'r'))
            if set(last_config.items()) != set(config['compiler'].items()):
-                clear_cache = True
+                clear_cache_on_start = True
            else:
                for key in last_config.keys():
                    if last_config[key] != config['compiler'][key]:
-                        clear_cache = True
+                        clear_cache_on_start = True
-        if config['cache']['clear_cache_on_start'] or clear_cache:
+        if config['cache']['clear_cache_on_start'] or clear_cache_on_start:
            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
        create_folder(config['cache']['object_cache'], False)
-        with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
            json.dump(config['compiler'], f, indent=4)
        os.replace(f.name, cache_status_file)
    if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -273,6 +283,7 @@ def clear_cache():
        create_folder(cache_config['object_cache'], False)
+# TODO don't hardcode C type. [1] of tuple output
 type_mapping = {
    np.float32: ('PyFloat_AsDouble', 'float'),
    np.float64: ('PyFloat_AsDouble', 'double'),
@@ -282,8 +293,6 @@ type_mapping = {
    np.uint16: ('PyLong_AsUnsignedLong', 'uint16_t'),
    np.uint32: ('PyLong_AsUnsignedLong', 'uint32_t'),
    np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
-    np.complex64: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexFloat'),
-    np.complex128: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexDouble'),
 }
 template_extract_scalar = """
@@ -293,14 +302,6 @@ if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '
 if( PyErr_Occurred() ) {{ return NULL; }}
 """
-template_extract_complex = """
-PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
-if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
-{target_type} {name}{{ ({real_type}) {extract_function_real}( obj_{name} ),
-                       ({real_type}) {extract_function_imag}( obj_{name} ) }};
-if( PyErr_Occurred() ) {{ return NULL; }}
-"""
 template_extract_array = """
 PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
 if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
@@ -368,8 +369,7 @@ def equal_size_check(fields):
        return ""
    ref_field = fields[0]
-    cond = ["(buffer_{field.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])".format(ref_field=ref_field,
+    cond = [f"(buffer_{field_to_test.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])"
-                                                                                            field=field_to_test, i=i)
            for field_to_test in fields[1:]
            for i in range(fields[0].spatial_dimensions)]
    cond = " && ".join(cond)
@@ -397,7 +397,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                aligned = False
                if ast_node.assignments:
                    aligned = any([a.lhs.args[2] for a in ast_node.assignments
-                                   if hasattr(a, 'lhs') and isinstance(a.lhs, cast_func)
+                                   if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
                                   and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])
                if ast_node.instruction_set and aligned:
@@ -407,9 +407,10 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                        for loop in ast_node.atoms(LoopOverCoordinate):
                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
-                                                                      loop.atoms(vector_memory_access)])
+                                                                      loop.atoms(VectorMemoryAccess)])
                        if has_openmp and has_nontemporal:
-                            byte_width = ast_node.instruction_set['cachelineSize']
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                    offset = max(max(ast_node.ghost_layers)) * item_size
                    offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
@@ -426,8 +427,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                if (np_dtype.isbuiltin and FieldType.is_generic(field)
                        and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)):
-                    dtype_cond = "buffer_{name}.format[0] == '{format}'".format(name=field.name,
+                    dtype_cond = f"buffer_{field.name}.format[0] == '{field.dtype.numpy_dtype.char}'"
-                                                                                format=field.dtype.numpy_dtype.char)
                    pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name,
                                                                 expected=str(field.dtype.numpy_dtype))
@@ -456,25 +456,14 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
        elif param.is_field_stride:
            field = param.fields[0]
            item_size = field.dtype.numpy_dtype.itemsize
-            parameters.append("buffer_{name}.strides[{i}] / {bytes}".format(bytes=item_size, i=param.symbol.coordinate,
+            parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
-                                                                            name=field.name))
        elif param.is_field_shape:
            parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
-        elif type(param.symbol) is CFunction:
-            continue
        else:
            extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
-            if np.issubdtype(param.symbol.dtype.numpy_dtype, np.complexfloating):
+            pre_call_code += template_extract_scalar.format(extract_function=extract_function,
-                pre_call_code += template_extract_complex.format(extract_function_real=extract_function[0],
+                                                            target_type=target_type,
-                                                                 extract_function_imag=extract_function[1],
+                                                            name=param.symbol.name)
-                                                                 target_type=target_type,
-                                                                 real_type="float" if target_type == "ComplexFloat"
-                                                                 else "double",
-                                                                 name=param.symbol.name)
-            else:
-                pre_call_code += template_extract_scalar.format(extract_function=extract_function,
-                                                                target_type=target_type,
-                                                                name=param.symbol.name)
            parameters.append(param.symbol.name)
@@ -494,18 +483,15 @@ def create_module_boilerplate_code(module_name, names):
 def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
    try:
-        spec = spec_from_file_location(name=module_name, location=path)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
-        import time
+        warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
-        import warnings
+        time.sleep(5)
-        warnings.warn("Could not load " + path + ", trying on more time...")
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
-        time.sleep(1)
+        mod = importlib.util.module_from_spec(spec)
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
        spec.loader.exec_module(mod)
    return getattr(mod, function_name)
@@ -544,15 +530,19 @@ class ExtensionModuleCode:
        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')
        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
        header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers])
-        includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+        includes = "\n".join([f"#include {include_file}" for include_file in header_list])
        self._code_string += includes
        self._code_string += "\n"
        self._code_string += f"#define RESTRICT {restrict_qualifier} \n"
@@ -561,7 +551,7 @@ class ExtensionModuleCode:
        for ast, name in zip(self._ast_nodes, self._function_names):
            old_name = ast.function_name
-            ast.function_name = "kernel_" + name
+            ast.function_name = f"kernel_{name}"
            self._code_string += generate_c(ast, custom_backend=self._custom_backend)
            self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast)
            ast.function_name = old_name
@@ -578,9 +568,12 @@ class ExtensionModuleCode:
        print(self._code_string, file=file)
-def compile_module(code, code_hash, base_dir, compile_flags=[]):
+def compile_module(code, code_hash, base_dir, compile_flags=None):
+    if compile_flags is None:
+        compile_flags = []
    compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
    if compiler_config['os'].lower() == 'windows':
        lib_suffix = '.pyd'
@@ -596,8 +589,11 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]):
    object_file = os.path.join(base_dir, code_hash + object_suffix)
    if not os.path.exists(object_file):
-        with file_handle_for_atomic_write(src_file) as f:
+        try:
-            code.write_to_file(f)
+            with open(src_file, 'x') as f:
+                code.write_to_file(f)
+        except FileExistsError:
+            pass
        if windows:
            compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split()
@@ -611,7 +607,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=[]):
        # Linking
        if windows:
-            import sysconfig
            config_vars = sysconfig.get_config_vars()
            py_lib = os.path.join(config_vars["installed_base"], "libs",
                                  f"python{config_vars['py_version_nodot']}.lib")
@@ -632,7 +627,12 @@ def compile_and_load(ast, custom_backend=None):
    cache_config = get_cache_config()
    compiler_config = get_compiler_config()
-    function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else ''
+    if compiler_config['os'].lower() == 'windows':
+        function_prefix = '__declspec(dllexport)'
+    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
+        function_prefix = ast.instruction_set['function_prefix']
+    else:
+        function_prefix = ''
    code = ExtensionModuleCode(custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)
@@ -645,7 +645,7 @@ def compile_and_load(ast, custom_backend=None):
        compile_flags = ast.instruction_set['compile_flags']
    if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:
No results found