5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/backends/dot.py
+++ b/pystencils/backends/dot.py
-from sympy.printing.printer import Printer
-from graphviz import Digraph, lang
 import graphviz
+try:
+    from graphviz import Digraph
+    import graphviz.quoting as quote
+except ImportError:
+    from graphviz import Digraph
+    import graphviz.lang as quote
+from sympy.printing.printer import Printer
 # noinspection PyPep8Naming
@@ -12,7 +17,7 @@ class DotPrinter(Printer):
        super(DotPrinter, self).__init__()
        self._node_to_str_function = node_to_str_function
        self.dot = Digraph(**kwargs)
-        self.dot.quote_edge = lang.quote
+        self.dot.quote_edge = quote.quote
    def _print_KernelFunction(self, func):
        self.dot.node(str(id(func)), style='filled', fillcolor='#a056db', label=self._node_to_str_function(func))
@@ -50,22 +55,20 @@ class DotPrinter(Printer):
 def __shortened(node):
-    from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Block, Conditional
+    from pystencils.astnodes import LoopOverCoordinate, KernelFunction, SympyAssignment, Conditional
    if isinstance(node, LoopOverCoordinate):
        return "Loop over dim %d" % (node.coordinate_to_loop_over,)
    elif isinstance(node, KernelFunction):
        params = node.get_parameters()
        param_names = [p.field_name for p in params if p.is_field_pointer]
        param_names += [p.symbol.name for p in params if not p.is_field_parameter]
-        return "Func: %s (%s)" % (node.function_name, ",".join(param_names))
+        return f"Func: {node.function_name} ({','.join(param_names)})"
    elif isinstance(node, SympyAssignment):
        return repr(node.lhs)
-    elif isinstance(node, Block):
-        return "Block" + str(id(node))
    elif isinstance(node, Conditional):
        return repr(node)
    else:
-        raise NotImplementedError("Cannot handle node type %s" % (type(node),))
+        raise NotImplementedError(f"Cannot handle node type {type(node)}")
 def print_dot(node, view=False, short=False, **kwargs):

--- a/src/pystencils/backends/json.py
+++ b/src/pystencils/backends/json.py
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2019 Stephan Seitz <stephan.seitz@fau.de>
+#
+# Distributed under terms of the GPLv3 license.
+"""
+"""
+import json
+from pystencils.astnodes import NodeOrExpr
+from pystencils.backends.cbackend import CustomSympyPrinter, generate_c
+try:
+    import yaml
+except ImportError:
+    raise ImportError('yaml not installed')
+def expr_to_dict(expr_or_node: NodeOrExpr, with_c_code=True, full_class_names=False):
+    """Convert a SymPy expression or AST node to a serializable dict (mainly for debugging purposes).
+    The result holds ``'str'`` (``str()`` of the object), optionally ``'c'`` (its C code) and one entry
+    per element of ``expr_or_node.args``, keyed by that argument's class and containing the argument's
+    own dict. ``with_c_code`` and ``full_class_names`` are forwarded to the recursive calls.
+    See :func:`.write_json`
+    Args:
+        expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
+        with_c_code (bool, optional): include C representation of the nodes
+        full_class_names (bool, optional): use full class names (``type(object)`` instead of
+            ``type(object).__name__``)
+    Returns:
+        dict: nested dict representation of ``expr_or_node``
+    """
+    result = {'str': str(expr_or_node)}
+    if with_c_code:
+        # Best effort: generate_c is tried first, then CustomSympyPrinter; if neither can print the
+        # object the 'c' entry is simply left out.
+        try:
+            result['c'] = generate_c(expr_or_node)
+        except Exception:
+            try:
+                result['c'] = CustomSympyPrinter().doprint(expr_or_node)
+            except Exception:
+                pass
+    for arg in expr_or_node.args:
+        key = str(arg.__class__ if full_class_names else arg.__class__.__name__)
+        # NOTE: arguments sharing a class name map to the same key, so only the last one is kept.
+        result[key] = expr_to_dict(arg, with_c_code=with_c_code, full_class_names=full_class_names)
+    return result
+def print_json(expr_or_node: NodeOrExpr):
+    """Render the debug dict of an AST as a JSON string.
+    Args:
+        expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
+    Returns:
+        str: JSON text (indented by 4 spaces) of the dict built by :func:`expr_to_dict`
+    """
+    return json.dumps(expr_to_dict(expr_or_node), indent=4)
+def write_json(filename: str, expr_or_node: NodeOrExpr):
+    """Write the debug JSON representation of an AST to a file.
+    Args:
+        filename (str): path of the file to write (opened in text mode ``'w'``)
+        expr_or_node (NodeOrExpr): a SymPy expression or a :class:`pystencils.astnodes.Node`
+    """
+    # the dict is built before the file is opened
+    as_dict = expr_to_dict(expr_or_node)
+    with open(filename, 'w') as out_file:
+        json.dump(as_dict, out_file, indent=4)
+def print_yaml(expr_or_node):
+    """Render the debug dict of an AST (short class names as keys) as a YAML string.
+    Args:
+        expr_or_node: a SymPy expression or AST node accepted by :func:`expr_to_dict`
+    Returns:
+        the value returned by ``yaml.dump`` for the dict built by :func:`expr_to_dict`
+    """
+    return yaml.dump(expr_to_dict(expr_or_node, full_class_names=False))
+def write_yaml(filename, expr_or_node):
+    """Write the debug YAML representation of an AST to a file.
+    Args:
+        filename: path of the file to write (opened in text mode ``'w'``)
+        expr_or_node: a SymPy expression or AST node accepted by :func:`expr_to_dict`
+    """
+    # the dict is built before the file is opened
+    as_dict = expr_to_dict(expr_or_node)
+    with open(filename, 'w') as out_file:
+        yaml.dump(as_dict, out_file)
--- a/src/pystencils/backends/ppc_instruction_sets.py
+++ b/src/pystencils/backends/ppc_instruction_sets.py
+def get_argument_string(function_shortcut):
+    """Build the parenthesised argument list of an intrinsic from its shortcut notation.
+    A shortcut looks like ``name[a, b, ...]``. The arguments ``0`` to ``5`` become the format
+    placeholders ``{0}`` to ``{5}``; every other argument (e.g. ``0x0``) is copied verbatim.
+    Args:
+        function_shortcut (str): shortcut that ends with a bracketed, comma-separated argument list
+    Returns:
+        str: e.g. ``'(0x0,{0})'`` for ``'xl[0x0, 0]'``, and ``'()'`` for an empty list
+    """
+    args = function_shortcut[function_shortcut.index('[') + 1: -1]
+    parts = []
+    for arg in args.split(","):
+        arg = arg.strip()
+        if not arg:
+            continue
+        parts.append("{" + arg + "}" if arg in ('0', '1', '2', '3', '4', '5') else arg)
+    # Joining (instead of stripping a trailing comma) keeps the opening parenthesis for an empty list.
+    return "(" + ",".join(parts) + ")"
+def get_vector_instruction_set_ppc(data_type='double', instruction_set='vsx'):
+    """Assemble the table of vector intrinsics for PowerPC (VSX).
+    Most values are ``str.format`` templates whose positional placeholders ``{0}``, ``{1}``, ... stand
+    for the operands; the remaining entries describe the vector types, widths and required headers.
+    Args:
+        data_type (str): element type, one of ``'double'``, ``'float'`` or ``'int'``
+        instruction_set (str): only ``'vsx'`` is supported
+    Returns:
+        dict: maps intrinsic ids (``'+'``, ``'loadA'``, ``('any', '==')``, ``'makeVec'``, ...) to
+        templates, plus ``'width'``, ``'intwidth'``, ``'bytes'``, ``'headers'`` and the type names
+    Raises:
+        NotImplementedError: if ``instruction_set`` is not ``'vsx'``
+    """
+    if instruction_set != 'vsx':
+        raise NotImplementedError(instruction_set)
+    base_names = {
+        '+': 'add[0, 1]',
+        '-': 'sub[0, 1]',
+        '*': 'mul[0, 1]',
+        '/': 'div[0, 1]',
+        'sqrt': 'sqrt[0]',
+        'rsqrt': 'rsqrte[0]',  # rsqrt is available too, but not on Clang
+        'loadU': 'xl[0x0, 0]',
+        'loadA': 'ld[0x0, 0]',
+        'storeU': 'xst[1, 0x0, 0]',
+        'storeA': 'st[1, 0x0, 0]',
+        'storeAAndFlushCacheline': 'stl[1, 0x0, 0]',
+        'abs': 'abs[0]',
+        '==': 'cmpeq[0, 1]',
+        '!=': 'cmpne[0, 1]',
+        '<=': 'cmple[0, 1]',
+        '<': 'cmplt[0, 1]',
+        '>=': 'cmpge[0, 1]',
+        '>': 'cmpgt[0, 1]',
+        '&': 'and[0, 1]',
+        '|': 'or[0, 1]',
+        'blendv': 'sel[0, 1, 2]',
+        ('any', '=='): 'any_eq[0, 1]',
+        ('any', '!='): 'any_ne[0, 1]',
+        ('any', '<='): 'any_le[0, 1]',
+        ('any', '<'): 'any_lt[0, 1]',
+        ('any', '>='): 'any_ge[0, 1]',
+        ('any', '>'): 'any_gt[0, 1]',
+        ('all', '=='): 'all_eq[0, 1]',
+        ('all', '!='): 'all_ne[0, 1]',
+        ('all', '<='): 'all_le[0, 1]',
+        ('all', '<'): 'all_lt[0, 1]',
+        ('all', '>='): 'all_ge[0, 1]',
+        ('all', '>'): 'all_gt[0, 1]',
+    }
+    bits = {'double': 64,
+            'float': 32,
+            'int': 32}
+    # number of elements in one 128-bit vector
+    width = 128 // bits[data_type]
+    intwidth = 128 // bits['int']
+    result = dict()
+    result['bytes'] = 16
+    # expand every shortcut 'name[args]' into 'vec_name(args)'
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+        arg_string = get_argument_string(function_shortcut)
+        result[intrinsic_id] = 'vec_' + name + arg_string
+    if data_type == 'double':
+        # Clang and XL C++ are missing these for doubles
+        result['loadA'] = '(__vector double)' + result['loadA'].format('(float*) {0}')
+        result['storeA'] = result['storeA'].format('(float*) {0}', '(__vector float) {1}')
+        result['storeAAndFlushCacheline'] = result['storeAAndFlushCacheline'].format('(float*) {0}',
+                                                                                     '(__vector float) {1}')
+    result['+int'] = "vec_add({0}, {1})"
+    result['width'] = width
+    result['intwidth'] = intwidth
+    result[data_type] = f'__vector {data_type}'
+    result['int'] = '__vector int'
+    result['bool'] = f'__vector __bool {"long long" if data_type == "double" else "int"}'
+    result['headers'] = ['<altivec.h>', '"ppc_altivec_helpers.h"']
+    # The doubled braces survive a later .format() call as literal '{' and '}'.
+    result['makeVecConst'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {0}' for _ in range(width)]) + '}})'
+    # Each element is a cast '(type) {i}', as in makeVecConst; braces around the type name would turn
+    # it into a named format field and break positional formatting.
+    result['makeVec'] = '((' + result[data_type] + '){{' + \
+        ", ".join(['(' + data_type + ') {' + str(i) + '}' for i in range(width)]) + '}})'
+    result['makeVecConstInt'] = '((' + result['int'] + '){{' + ", ".join(['(int) {0}' for _ in range(intwidth)]) + '}})'
+    result['makeVecInt'] = '((' + result['int'] + '){{(int) {0}, (int) {1}, (int) {2}, (int) {3}}})'
+    result['any'] = 'vec_any_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+    result['all'] = 'vec_all_ne({0}, ((' + result['bool'] + ') {{' + ", ".join(['0'] * width) + '}}))'
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+    return result
--- a/src/pystencils/backends/riscv_instruction_sets.py
+++ b/src/pystencils/backends/riscv_instruction_sets.py
+from pystencils.typing import CFunction
+def get_argument_string(function_shortcut, last=''):
+    """Build the parenthesised argument list of an intrinsic from its shortcut notation.
+    A shortcut looks like ``name[a, b, ...]``. The arguments ``0`` to ``5`` become the format
+    placeholders ``{0}`` to ``{5}``; every other argument is copied verbatim.
+    Args:
+        function_shortcut (str): shortcut that ends with a bracketed, comma-separated argument list
+        last (str): extra argument appended after the listed ones; skipped when empty
+    Returns:
+        str: e.g. ``'({0},{1},vl)'`` for ``'fadd_vv[0, 1]'`` with ``last='vl'``; ``'()'`` when there
+        is no argument at all
+    """
+    args = function_shortcut[function_shortcut.index('[') + 1: -1]
+    parts = []
+    for arg in args.split(","):
+        arg = arg.strip()
+        if not arg:
+            continue
+        parts.append("{" + arg + "}" if arg in ('0', '1', '2', '3', '4', '5') else arg)
+    if last:
+        parts.append(last)
+    # Joining (instead of stripping a trailing comma) keeps the opening parenthesis for an empty list.
+    return "(" + ",".join(parts) + ")"
+def get_vector_instruction_set_riscv(data_type='double', instruction_set='rvv'):
+    """Assemble the table of vector intrinsics for the RISC-V vector extension (``rvv``).
+    Most values are ``str.format`` templates: positional placeholders ``{0}``, ``{1}``, ... stand for
+    the operands, and the named placeholders ``{loop_stop}`` and ``{loop_counter}`` form the vector
+    length expression that is appended as last argument of every generated call.
+    Args:
+        data_type (str): element type, one of ``'double'``, ``'float'`` or ``'int'``
+        instruction_set (str): must be ``'rvv'`` (checked with an ``assert``)
+    Returns:
+        dict: maps intrinsic ids to templates; ``'width'`` and ``'intwidth'`` are ``CFunction``
+        objects wrapping a ``vsetvlmax_e<bits>m1()`` call, ``'bytes'`` is such a call as plain string
+    """
+    assert instruction_set == 'rvv'
+    bits = {'double': 64,
+            'float': 32,
+            'int': 32}
+    base_names = {
+        '+': 'fadd_vv[0, 1]',
+        '-': 'fsub_vv[0, 1]',
+        '*': 'fmul_vv[0, 1]',
+        '/': 'fdiv_vv[0, 1]',
+        'sqrt': 'fsqrt_v[0]',
+        'loadU': f'le{bits[data_type]}_v[0]',
+        'storeU': f'se{bits[data_type]}_v[0, 1]',
+        'maskStoreU': f'se{bits[data_type]}_v[2, 0, 1]',
+        'loadS': f'lse{bits[data_type]}_v[0, 1]',
+        'storeS': f'sse{bits[data_type]}_v[0, 2, 1]',
+        'maskStoreS': f'sse{bits[data_type]}_v[3, 0, 2, 1]',
+        'abs': 'fabs_v[0]',
+        '==': 'mfeq_vv[0, 1]',
+        '!=': 'mfne_vv[0, 1]',
+        '<=': 'mfle_vv[0, 1]',
+        '<': 'mflt_vv[0, 1]',
+        '>=': 'mfge_vv[0, 1]',
+        '>': 'mfgt_vv[0, 1]',
+        '&': 'mand_mm[0, 1]',
+        '|': 'mor_mm[0, 1]',
+        'blendv': 'merge_vvm[2, 0, 1]',
+        'any': 'cpop_m[0]',
+        'all': 'cpop_m[0]',
+    }
+    result = dict()
+    width = f'vsetvlmax_e{bits[data_type]}m1()'
+    # f-string so that the element size is substituted, exactly as for ``width`` above
+    intwidth = f'vsetvlmax_e{bits["int"]}m1()'
+    result['bytes'] = 'vsetvlmax_e8m1()'
+    prefix = 'v'
+    suffix = f'_f{bits[data_type]}m1'
+    # vector length expression; the named placeholders are left for a later .format() call
+    vl = '{loop_stop} - {loop_counter}'
+    int_vl = f'({vl})*{bits[data_type]//bits["int"]}'
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+        # the type suffix depends on the kind of intrinsic: comparison, mask operation, masked store
+        if name.startswith('mf'):
+            suffix2 = suffix + f'_b{bits[data_type]}'
+        elif name.endswith('_mm') or name.endswith('_m'):
+            suffix2 = f'_b{bits[data_type]}'
+        elif intrinsic_id.startswith('mask'):
+            suffix2 = suffix + '_m'
+        else:
+            suffix2 = suffix
+        arg_string = get_argument_string(function_shortcut, last=vl)
+        result[intrinsic_id] = prefix + name + suffix2 + arg_string
+    result['width'] = CFunction(width, "int")
+    result['intwidth'] = CFunction(intwidth, "int")
+    result['makeVecConst'] = f'vfmv_v_f_f{bits[data_type]}m1({{0}}, {vl})'
+    result['makeVecConstInt'] = f'vmv_v_x_i{bits["int"]}m1({{0}}, {int_vl})'
+    result['makeVecIndex'] = f'vmacc_vx_i{bits["int"]}m1({result["makeVecConstInt"]}, {{1}}, ' + \
+                             f'vid_v_i{bits["int"]}m1({int_vl}), {int_vl})'
+    # the stride operand of the strided loads/stores is multiplied by the element size in bytes
+    result['storeS'] = result['storeS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
+    result['loadS'] = result['loadS'].replace('{1}', f'{{1}}*{bits[data_type]//8}')
+    result['maskStoreS'] = result['maskStoreS'].replace('{2}', f'{{2}}*{bits[data_type]//8}')
+    result['+int'] = f"vadd_vv_i{bits['int']}m1({{0}}, {{1}}, {int_vl})"
+    result['float'] = f'vfloat{bits["float"]}m1_t'
+    result['double'] = f'vfloat{bits["double"]}m1_t'
+    result['int'] = f'vint{bits["int"]}m1_t'
+    result['bool'] = f'vbool{bits[data_type]}_t'
+    result['headers'] = ['<riscv_vector.h>', '"riscv_v_helpers.h"']
+    # 'any' and 'all' share the same cpop_m call and differ only in the comparison appended here
+    result['any'] += ' > 0x0'
+    result['all'] += f' == vsetvl_e{bits[data_type]}m1({vl})'
+    result['cachelineSize'] = 'cachelineSize()'
+    result['cachelineZero'] = 'cachelineZero((void*) {0})'
+    return result
--- a/src/pystencils/backends/simd_instruction_sets.py
+++ b/src/pystencils/backends/simd_instruction_sets.py
+import os
+import platform
+from ctypes import CDLL, c_int, c_size_t, sizeof, byref
+from warnings import warn
+import numpy as np
+from pystencils.backends.x86_instruction_sets import get_vector_instruction_set_x86
+from pystencils.backends.arm_instruction_sets import get_vector_instruction_set_arm
+from pystencils.backends.ppc_instruction_sets import get_vector_instruction_set_ppc
+from pystencils.backends.riscv_instruction_sets import get_vector_instruction_set_riscv
+from pystencils.cache import memorycache
+from pystencils.typing import numpy_name_to_c
+def get_vector_instruction_set(data_type='double', instruction_set='avx'):
+    """Return the intrinsics table for ``instruction_set``, dispatching to the architecture module.
+    Args:
+        data_type: any name accepted by ``numpy.dtype``; the bare name ``'float'`` triggers a warning
+            and is treated as ``'float64'``
+        instruction_set: ``'neon'``, ``'sme'`` or ``'sve...'`` select ARM, ``'vsx'`` PowerPC,
+            ``'rvv'`` RISC-V; every other name is passed to the x86 implementation
+    Returns:
+        the dict produced by the selected ``get_vector_instruction_set_*`` function
+    """
+    if data_type == 'float':
+        warn(f"Ambiguous input for data_type: {data_type}. For single precision please use float32. "
+             f"For more information please take numpy.dtype as a reference. This input will not be supported in future "
+             f"releases")
+        data_type = 'float64'
+    c_type_name = numpy_name_to_c(np.dtype(data_type).name)
+    is_arm = instruction_set in ['neon', 'sme'] or instruction_set.startswith('sve')
+    if is_arm:
+        return get_vector_instruction_set_arm(c_type_name, instruction_set)
+    if instruction_set == 'vsx':
+        return get_vector_instruction_set_ppc(c_type_name, instruction_set)
+    if instruction_set == 'rvv':
+        return get_vector_instruction_set_riscv(c_type_name, instruction_set)
+    # everything else is handled (or rejected) by the x86 implementation
+    return get_vector_instruction_set_x86(c_type_name, instruction_set)
+@memorycache
+def get_supported_instruction_sets():
+    """List of supported instruction sets on current hardware, or None if query failed.
+    The environment variable ``PYSTENCILS_SIMD`` (comma-separated names) overrides the detection.
+    Otherwise the result depends on ``platform.system()`` and ``platform.machine()``; on x86 the
+    ``cpuinfo`` package is queried and ``None`` is returned when it cannot be imported.
+    Raises:
+        OSError: if the SVE vector length query on Linux/aarch64 reports a negative value
+        NotImplementedError: if no detection is implemented for the current system/machine pair
+    """
+    if 'PYSTENCILS_SIMD' in os.environ:
+        return os.environ['PYSTENCILS_SIMD'].split(',')
+    if platform.system() == 'Darwin' and platform.machine() == 'arm64':
+        result = ['neon']
+        libc = CDLL('/usr/lib/libc.dylib')
+        # sysctlbyname writes the queried value into ``value``; ``size`` carries the buffer size
+        value = c_int(0)
+        size = c_size_t(sizeof(value))
+        status = libc.sysctlbyname(b"hw.optional.arm.FEAT_SME", byref(value), byref(size), None, 0)
+        if status == 0 and value.value == 1:
+            result.insert(0, "sme")
+        return result
+    elif platform.system() == 'Windows' and platform.machine() == 'ARM64':
+        return ['neon']
+    elif platform.system() == 'Linux' and platform.machine() == 'aarch64':
+        result = ['neon']  # Neon is mandatory on 64-bit ARM
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        hwcap2 = libc.getauxval(26)  # AT_HWCAP2
+        if hwcap & (1 << 22):  # HWCAP_SVE
+            if hwcap2 & (1 << 1):  # HWCAP2_SVE2
+                name = 'sve2'
+            else:
+                name = 'sve'
+            # prctl result is multiplied by 8 (bytes -> bits, presumably); a failure shows up as negative
+            length = 8 * libc.prctl(51, 0, 0, 0, 0)  # PR_SVE_GET_VL
+            if length < 0:
+                raise OSError("SVE length query failed")
+            # list every power-of-two fraction of the reported length down to 128, then the bare name
+            while length >= 128:
+                result.append(f"{name}{length}")
+                length //= 2
+            result.append(name)
+        if hwcap2 & (1 << 23):  # HWCAP2_SME
+            result.insert(0, "sme")  # prepend to list so it is not automatically chosen as best instruction set
+        return result
+    elif platform.system() == 'Linux' and platform.machine().startswith('riscv'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        # one hwcap bit per ISA extension letter, counted from 'A'
+        hwcap_isa_v = 1 << (ord('V') - ord('A'))  # COMPAT_HWCAP_ISA_V
+        return ['rvv'] if hwcap & hwcap_isa_v else []
+    elif platform.system() == 'Linux' and platform.machine().startswith('ppc64'):
+        libc = CDLL('libc.so.6')
+        hwcap = libc.getauxval(16)  # AT_HWCAP
+        return ['vsx'] if hwcap & 0x00000080 else []  # PPC_FEATURE_HAS_VSX
+    elif platform.machine() in ['x86_64', 'x86', 'AMD64', 'i386']:
+        try:
+            from cpuinfo import get_cpu_info
+        except ImportError:
+            # without cpuinfo the CPU flags cannot be read: report "unknown" rather than "none"
+            return None
+        result = []
+        required_sse_flags = {'sse', 'sse2', 'ssse3', 'sse4_1', 'sse4_2'}
+        required_avx_flags = {'avx', 'avx2'}
+        required_avx512_flags = {'avx512f'}
+        possible_avx512vl_flags = {'avx512vl', 'avx10_1'}
+        flags = set(get_cpu_info()['flags'])
+        # sse/avx/avx512 need all of their flags; avx512vl needs any one of the listed flags
+        if flags.issuperset(required_sse_flags):
+            result.append("sse")
+        if flags.issuperset(required_avx_flags):
+            result.append("avx")
+        if flags.issuperset(required_avx512_flags):
+            result.append("avx512")
+        if not flags.isdisjoint(possible_avx512vl_flags):
+            result.append("avx512vl")
+        return result
+    else:
+        raise NotImplementedError('Instruction set detection for %s on %s is not implemented' %
+                                  (platform.system(), platform.machine()))
+@memorycache
+def get_cacheline_size(instruction_set):
+    """Get the size (in bytes) of a cache block that can be zeroed without memory access.
+    Usually, this is identical to the cache line size.
+    The value is obtained by compiling and running a small vectorized kernel that stores the
+    ``CachelineSize`` symbol into a one-element array.
+    Args:
+        instruction_set: name of the vector instruction set to query
+    Returns:
+        int or None: the measured size, or ``None`` if the instruction set has no ``'cachelineSize'`` entry
+    """
+    if 'cachelineSize' not in get_vector_instruction_set('double', instruction_set):
+        return None
+    # imported here rather than at module level, as in the original code
+    import pystencils as ps
+    from pystencils.astnodes import SympyAssignment
+    import numpy as np
+    from pystencils.cpu.vectorization import CachelineSize
+    target_array = np.zeros((1, 1), dtype=np.float32)
+    field = ps.Field.create_from_numpy_array('f', target_array, index_dimensions=0)
+    assignments = [CachelineSize(), SympyAssignment(field.center, CachelineSize.symbol)]
+    kernel_ast = ps.create_kernel(assignments, cpu_vectorize_info={'instruction_set': instruction_set})
+    compiled_kernel = kernel_ast.compile()
+    compiled_kernel(**{field.name: target_array, CachelineSize.symbol.name: 0})
+    # the kernel wrote the cache line size into the single array element
+    return int(target_array[0, 0])
--- a/src/pystencils/backends/x86_instruction_sets.py
+++ b/src/pystencils/backends/x86_instruction_sets.py
+def get_argument_string(intrinsic_id, width, function_shortcut):
+    """Build the parenthesised argument list for an x86 intrinsic template.
+    The ``makeVec*`` ids generate one argument per vector lane; every other id takes its arguments
+    from the shortcut notation ``name[a, b, ...]``, where ``0`` to ``5`` become the format
+    placeholders ``{0}`` to ``{5}`` and any other argument is copied verbatim.
+    Args:
+        intrinsic_id (str): id of the intrinsic, e.g. ``'+'`` or ``'makeVecBool'``
+        width (int): number of vector lanes, used by the ``makeVec*`` ids only
+        function_shortcut (str): shortcut ending with a bracketed, comma-separated argument list
+    Returns:
+        str: the argument list including the surrounding parentheses
+    """
+    if intrinsic_id == 'makeVecConst' or intrinsic_id == 'makeVecConstInt':
+        # the same value in every lane
+        arg_string = f"({','.join(['{0}'] * width)})"
+    elif intrinsic_id == 'makeVec' or intrinsic_id == 'makeVecInt':
+        # one placeholder per lane, highest index first
+        params = ["{" + str(i) + "}" for i in reversed(range(width))]
+        arg_string = f"({','.join(params)})"
+    elif intrinsic_id == 'makeVecBool':
+        # balanced '((cond) ? -1.0 : 0.0)' per lane, same shape as makeVecConstBool below
+        params = [f"(({{{i}}}) ? -1.0 : 0.0)" for i in reversed(range(width))]
+        arg_string = f"({','.join(params)})"
+    elif intrinsic_id == 'makeVecConstBool':
+        params = ["(({0}) ? -1.0 : 0.0)" for _ in range(width)]
+        arg_string = f"({','.join(params)})"
+    else:
+        args = function_shortcut[function_shortcut.index('[') + 1: -1]
+        parts = []
+        for arg in args.split(","):
+            arg = arg.strip()
+            if not arg:
+                continue
+            parts.append("{" + arg + "}" if arg in ('0', '1', '2', '3', '4', '5') else arg)
+        # Joining (instead of stripping a trailing comma) keeps the opening parenthesis for an empty list.
+        arg_string = "(" + ",".join(parts) + ")"
+    return arg_string
+def get_vector_instruction_set_x86(data_type='double', instruction_set='avx'):
+    """Assemble the table of vector intrinsics for x86 (``sse``, ``avx``, ``avx512``, ``avx512vl``).
+    Most values are ``str.format`` templates whose positional placeholders ``{0}``, ``{1}``, ... stand
+    for the operands; the remaining entries describe the vector types, widths and required headers.
+    Args:
+        data_type (str): element type, one of ``'double'``, ``'float'`` or ``'int'``
+        instruction_set (str): one of the keys of the ``prefix`` table below; any other name fails
+            with a ``KeyError`` at the first table lookup
+    Returns:
+        dict: maps intrinsic ids (``'+'``, ``'loadU'``, ``'makeVec'``, ...) to templates, plus
+        ``'width'``, ``'intwidth'``, ``'bytes'``, ``'headers'`` and the vector type names
+    """
+    # comparison operator -> predicate constant passed as third argument of the cmp intrinsic
+    comparisons = {
+        '==': '_CMP_EQ_UQ',
+        '!=': '_CMP_NEQ_UQ',
+        '>=': '_CMP_GE_OQ',
+        '<=': '_CMP_LE_OQ',
+        '<': '_CMP_NGE_UQ',
+        '>': '_CMP_NLE_UQ',
+    }
+    # intrinsic id -> shortcut 'name[args]', expanded by get_argument_string in the loop below
+    base_names = {
+        '+': 'add[0, 1]',
+        '-': 'sub[0, 1]',
+        '*': 'mul[0, 1]',
+        '/': 'div[0, 1]',
+        '&': 'and[0, 1]',
+        '|': 'or[0, 1]',
+        'blendv': 'blendv[0, 1, 2]',
+        'sqrt': 'sqrt[0]',
+        'makeVecConst': 'set[]',
+        'makeVec': 'set[]',
+        'makeVecBool': 'set[]',
+        'makeVecConstBool': 'set[]',
+        'makeVecInt': 'set[]',
+        'makeVecConstInt': 'set[]',
+        'loadU': 'loadu[0]',
+        'loadA': 'load[0]',
+        'storeU': 'storeu[0,1]',
+        'storeA': 'store[0,1]',
+        'stream': 'stream[0,1]',
+        'maskStoreA': 'mask_store[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
+        'maskStoreU': 'mask_storeu[0, 2, 1]' if instruction_set.startswith('avx512') else 'maskstore[0, 2, 1]',
+    }
+    # the comparisons are appended last, so they are also the last entries visited by the loop below
+    for comparison_op, constant in comparisons.items():
+        base_names[comparison_op] = f'cmp[0, 1, {constant}]'
+    headers = {
+        'avx512': ['<immintrin.h>'],
+        'avx512vl': ['<immintrin.h>'],
+        'avx': ['<immintrin.h>'],
+        'sse': ['<immintrin.h>', '<xmmintrin.h>', '<emmintrin.h>', '<pmmintrin.h>',
+                '<tmmintrin.h>', '<smmintrin.h>', '<nmmintrin.h>']
+    }
+    # intrinsic name suffix per element type
+    suffix = {
+        'double': 'pd',
+        'float': 'ps',
+        'int': 'epi32'
+    }
+    # intrinsic name prefix per instruction set
+    prefix = {
+        'sse': '_mm',
+        'avx': '_mm256',
+        'avx512vl': '_mm256',
+        'avx512': '_mm512',
+    }
+    # number of vector lanes per (element type, instruction set)
+    width = {
+        ("double", "sse"): 2,
+        ("float", "sse"): 4,
+        ("int", "sse"): 4,
+        ("double", "avx"): 4,
+        ("float", "avx"): 8,
+        ("int", "avx"): 8,
+        ("double", "avx512vl"): 4,
+        ("float", "avx512vl"): 8,
+        ("int", "avx512vl"): 8,
+        ("double", "avx512"): 8,
+        ("float", "avx512"): 16,
+        ("int", "avx512"): 16,
+    }
+    result = {
+        'width': width[(data_type, instruction_set)],
+        'intwidth': width[('int', instruction_set)],
+        'bytes': 4 * width[("float", instruction_set)]
+    }
+    pre = prefix[instruction_set]
+    # expand every shortcut into '<prefix>_<name>_<suffix>[_mask](args)'
+    for intrinsic_id, function_shortcut in base_names.items():
+        function_shortcut = function_shortcut.strip()
+        name = function_shortcut[:function_shortcut.index('[')]
+        # ids containing 'Int' always use the integer suffix and the integer lane count
+        if 'Int' in intrinsic_id:
+            suf = suffix['int']
+            arg_string = get_argument_string(intrinsic_id, result['intwidth'], function_shortcut)
+        else:
+            suf = suffix[data_type]
+            arg_string = get_argument_string(intrinsic_id, result['width'], function_shortcut)
+        mask_suffix = '_mask' if instruction_set.startswith('avx512') and intrinsic_id in comparisons.keys() else ''
+        result[intrinsic_id] = pre + "_" + name + "_" + suf + mask_suffix + arg_string
+    # NOTE: ``suf`` is reused below with the value from the last loop iteration; the last ids are the
+    # comparison operators (no 'Int' in the key), so it equals suffix[data_type] here.
+    bit_width = result['width'] * (64 if data_type == 'double' else 32)
+    result['double'] = f"__m{bit_width}d"
+    result['float'] = f"__m{bit_width}"
+    result['int'] = f"__m{bit_width}i"
+    result['bool'] = result[data_type]
+    result['headers'] = headers[instruction_set]
+    result['any'] = f"{pre}_movemask_{suf}({{0}}) > 0"
+    result['all'] = f"{pre}_movemask_{suf}({{0}}) == {hex(2**result['width']-1)}"
+    # "x" is appended to set/set1_epi<N> names when lanes are 64 bits wide and the vector is below 512 bits
+    setsuf = "x" if bit_width < 512 and bit_width // result['width'] == 64 else ""
+    if instruction_set.startswith('avx512'):
+        # for the avx512 variants the mask-register forms replace several entries created above
+        size = result['width']
+        masksize = max(size, 8)
+        result['&'] = f'_kand_mask{masksize}({{0}}, {{1}})'
+        result['|'] = f'_kor_mask{masksize}({{0}}, {{1}})'
+        result['any'] = f'!_ktestz_mask{masksize}_u8({{0}}, {{0}})'
+        result['all'] = f'_kortestc_mask{masksize}_u8({{0}}, {{0}})'
+        result['blendv'] = f'{pre}_mask_blend_{suf}({{2}}, {{0}}, {{1}})'
+        result['rsqrt'] = f"{pre}_rsqrt14_{suf}({{0}})"
+        result['bool'] = f"__mmask{masksize}"
+        # bit i of the mask is set when operand i is true; always eight operands, independent of width
+        params = " | ".join(["({{{i}}} ? {power} : 0)".format(i=i, power=2 ** i) for i in range(8)])
+        result['makeVecBool'] = f"__mmask8(({params}) )"
+        params = " | ".join(["({{0}} ? {power} : 0)".format(power=2 ** i) for i in range(8)])
+        result['makeVecConstBool'] = f"__mmask8(({params}) )"
+        # index vector (width-1, ..., 1, 0) multiplied by a stride; the stride placeholder is filled in below
+        vindex = f'{pre}_set_epi{bit_width//size}{setsuf}(' + \
+                 ', '.join([str(i) for i in range(result['width'])][::-1]) + ')'
+        vindex = f'{pre}_mullo_epi{bit_width//size}({vindex}, {pre}_set1_epi{bit_width//size}{setsuf}({{0}}))'
+        # element size in bytes
+        scale = bit_width // size // 8
+        result['storeS'] = f'{pre}_i{bit_width//size}scatter_{suf}({{0}}, ' + vindex.format("{2}") + \
+                           f', {{1}}, {scale})'
+        result['maskStoreS'] = f'{pre}_mask_i{bit_width//size}scatter_{suf}({{0}}, {{3}}, ' + vindex.format("{2}") + \
+                               f', {{1}}, {scale})'
+        # the gather template takes base pointer and index vector in a different order for 512 bits
+        if bit_width == 512:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}(' + vindex.format("{1}") + f', {{0}}, {scale})'
+        else:
+            result['loadS'] = f'{pre}_i{bit_width//size}gather_{suf}({{0}}, ' + vindex.format("{1}") + f', {scale})'
+    # abs intrinsic exists in 512 bits, but expands to a sequence. We generate that same sequence for 128 and 256 bits
+    if instruction_set == 'avx512':
+        result['abs'] = f"{pre}_abs_{suf}({{0}})"
+    else:
+        # bitwise AND with 0x7f..f in every lane, applied to the value reinterpreted as integers
+        result['abs'] = f"{pre}_castsi{bit_width}_{suf}({pre}_and_si{bit_width}(" + \
+                        f"{pre}_set1_epi{bit_width // result['width']}{setsuf}(0x7" + \
+                        'f' * (bit_width // result['width'] // 4 - 1) + "), " + \
+                        f"{pre}_cast{suf}_si{bit_width}({{0}})))"
+    if instruction_set == 'avx' and data_type == 'float':
+        result['rsqrt'] = f"{pre}_rsqrt_{suf}({{0}})"
+    result['+int'] = f"{pre}_add_{suffix['int']}({{0}}, {{1}})"
+    result['streamFence'] = '_mm_mfence()'
+    return result
--- a/src/pystencils/bit_masks.py
+++ b/src/pystencils/bit_masks.py
+import sympy as sp
+# from pystencils.typing import get_type_of_expression
+# noinspection PyPep8Naming
+class flag_cond(sp.Function):
+    """Evaluates a flag condition on a bit mask, and returns the value of one of two expressions,
+    depending on whether the flag is set. ``flag_bit`` is the index of the bit tested in ``mask``.
+    Three argument version:
+    ```
+        flag_cond(flag_bit, mask, expr) = expr if (flag_bit is set in mask) else 0
+    ```
+    Four argument version:
+    ```
+        flag_cond(flag_bit, mask, expr_then, expr_else) = expr_then if (flag_bit is set in mask) else expr_else
+    ```
+    """
+    nargs = (3, 4)
+    def __new__(cls, flag_bit, mask_expression, *expressions):
+        # TODO Jan reintroduce checking
+        # flag_dtype = get_type_of_expression(flag_bit)
+        # if not flag_dtype.is_int():
+        #     raise ValueError('Argument flag_bit must be of integer type.')
+        #
+        # mask_dtype = get_type_of_expression(mask_expression)
+        # if not mask_dtype.is_int():
+        #     raise ValueError('Argument mask_expression must be of integer type.')
+        return super().__new__(cls, flag_bit, mask_expression, *expressions)
+    def to_c(self, print_func):
+        """Return branch-free C code selecting between the then/else expressions.
+        Args:
+            print_func: callable turning a sub-expression into its C code string
+        Returns:
+            str: ``bit * (then)``, plus ``(bit ^ 1) * (else)`` in the four argument version, where
+            ``bit`` is the tested mask bit isolated as 0 or 1
+        """
+        flag_bit_code = print_func(self.args[0])
+        mask_code = print_func(self.args[1])
+        then_code = print_func(self.args[2])
+        # the tested bit, reduced to 0 or 1
+        bit_code = f"(({mask_code}) >> ({flag_bit_code}) & 1)"
+        code = f"{bit_code} * ({then_code})"
+        if len(self.args) > 3:
+            else_code = print_func(self.args[3])
+            # Negate the isolated bit: XOR-ing the unmasked shift result would keep the higher mask
+            # bits, so the factor would not be 0 or 1.
+            code += f" + ({bit_code} ^ 1) * ({else_code})"
+        return code
--- a/pystencils/boundaries/__init__.py
+++ b/pystencils/boundaries/__init__.py
+from pystencils.boundaries.boundaryconditions import Dirichlet, Neumann
 from pystencils.boundaries.boundaryhandling import BoundaryHandling
-from pystencils.boundaries.boundaryconditions import Neumann, Dirichlet
 from pystencils.boundaries.inkernel import add_neumann_boundary
 __all__ = ['BoundaryHandling', 'Neumann', 'Dirichlet', 'add_neumann_boundary']
--- a/pystencils/boundaries/boundaryconditions.py
+++ b/pystencils/boundaries/boundaryconditions.py
-from typing import List, Tuple, Any
+from typing import Any, List, Tuple
-from pystencils import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
-from pystencils.data_types import create_type
+from pystencils.typing import create_type
 class Boundary:
@@ -13,7 +14,7 @@ class Boundary:
    def __init__(self, name=None):
        self._name = name
-    def __call__(self, field, direction_symbol, index_field) -> List[Assignment]:
+    def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
        """Defines the boundary behavior and must therefore be implemented by all boundaries.
        Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
@@ -62,20 +63,20 @@ class Neumann(Boundary):
        neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
        if field.index_dimensions == 0:
-            return [Assignment(field.center, field[neighbor])]
+            return [SympyAssignment(field.center, field[neighbor])]
        else:
            from itertools import product
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
            index_iter = product(*(range(i) for i in field.index_shape))
-            return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
+            return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
    def __hash__(self):
        # All boundaries of these class behave equal -> should also be equal
        return hash("Neumann")
    def __eq__(self, other):
-        return type(other) == Neumann
+        return type(other) is Neumann
 class Dirichlet(Boundary):
@@ -83,7 +84,7 @@ class Dirichlet(Boundary):
    inner_or_boundary = False
    single_link = True
-    def __init__(self, value, name="Dirchlet"):
+    def __init__(self, value, name=None):
        super().__init__(name)
        self._value = value
@@ -102,11 +103,11 @@ class Dirichlet(Boundary):
    def __call__(self, field, direction_symbol, index_field, **kwargs):
        if field.index_dimensions == 0:
-            return [Assignment(field, index_field("value") if self.additional_data else self._value)]
+            return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
        elif field.index_dimensions == 1:
            assert not self.additional_data
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Field needs fixed index shape")
            assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
-            return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
+            return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
        raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
+from functools import lru_cache
 import numpy as np
 import sympy as sp
-from pystencils.assignment import Assignment
-from pystencils import Field, TypedSymbol, create_indexed_kernel
+from pystencils import create_kernel, CreateKernelConfig, Target
+from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CustomCodeNode
-from pystencils.boundaries.createindexlist import numpy_data_type_for_boundary_object, create_boundary_index_array
+from pystencils.boundaries.createindexlist import (
-from pystencils.cache import memorycache
+    create_boundary_index_array, numpy_data_type_for_boundary_object)
-from pystencils.data_types import create_type
+from pystencils.typing import TypedSymbol, create_type
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
+from pystencils.field import Field
+from pystencils.typing.typed_sympy import FieldPointerSymbol
+try:
+    # noinspection PyPep8Naming
+    import waLBerla as wlb
+    if wlb.cpp_available:
+        from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
+    else:
+        ParallelDataHandling = None
+except ImportError:
+    ParallelDataHandling = None
 DEFAULT_FLAG_TYPE = np.uint32
@@ -19,11 +35,11 @@ class FlagInterface:
        >>> dh = create_data_handling((4, 5))
        >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
        >>> assert dh.has_data('flag_field')
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        2
-        >>> fi.reserve_flag(4)
+        >>> int(fi.reserve_flag(4))
        4
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        8
    """
@@ -52,13 +68,13 @@ class FlagInterface:
                self._used_flags.add(flag)
                assert self._is_power_of_2(flag)
                return flag
-        raise ValueError("All available {} flags are reserved".format(self.max_bits))
+        raise ValueError(f"All available {self.max_bits} flags are reserved")
    def reserve_flag(self, flag):
        assert self._is_power_of_2(flag)
        flag = self.dtype(flag)
        if flag in self._used_flags:
-            raise ValueError("The flag {flag} is already reserved".format(flag=flag))
+            raise ValueError(f"The flag {flag} is already reserved")
        self._used_flags.add(flag)
        return flag
@@ -70,7 +86,7 @@ class FlagInterface:
 class BoundaryHandling:
    def __init__(self, data_handling, field_name, stencil, name="boundary_handling", flag_interface=None,
-                 target='cpu', openmp=True):
+                 target: Target = Target.CPU, openmp=True):
        assert data_handling.has_data(field_name)
        assert data_handling.dim == len(stencil[0]), "Dimension of stencil and data handling do not match"
        self._data_handling = data_handling
@@ -84,8 +100,33 @@ class BoundaryHandling:
        fi = flag_interface
        self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")
-        gpu = self._target == 'gpu'
+        if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-        data_handling.add_custom_class(self._index_array_name, self.IndexFieldBlockData, cpu=True, gpu=gpu)
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
+        else:
+            array_handler = self.data_handling.array_handler
+        def to_cpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                array_handler.download(gpu_version[obj], cpu_arr)
+        def to_gpu(gpu_version, cpu_version):
+            gpu_version = gpu_version.boundary_object_to_index_list
+            cpu_version = cpu_version.boundary_object_to_index_list
+            for obj, cpu_arr in cpu_version.items():
+                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
+                    gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
+                    array_handler.upload(gpu_version[obj], cpu_arr)
+                else:
+                    array_handler.upload(gpu_version[obj], cpu_arr)
+        class_ = self.IndexFieldBlockData
+        class_.to_cpu = to_cpu
+        class_.to_gpu = to_gpu
+        gpu = self._target in data_handling._GPU_LIKE_TARGETS
+        data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
    @property
    def data_handling(self):
@@ -201,7 +242,7 @@ class BoundaryHandling:
        if self._dirty:
            self.prepare()
-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
            for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                kwargs[self._field_name] = b[self._field_name]
                kwargs['indexField'] = idx_arr
@@ -216,7 +257,7 @@ class BoundaryHandling:
        if self._dirty:
            self.prepare()
-        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
            for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                arguments = kwargs.copy()
                arguments[self._field_name] = b[self._field_name]
@@ -233,11 +274,13 @@ class BoundaryHandling:
        """
        Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0
        This can be used to display the simulation geometry in Paraview
-        :param file_name: vtk filename
-        :param boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all
+        Args:
-                         boundary conditions.
+            file_name: vtk filename
-                         can also  be a sequence, to write multiple boundaries to VTK file
+            boundaries: boundary object, or special string 'domain' for domain cells or special string 'all' for all
-        :param ghost_layers: number of ghost layers to write, or True for all, False for none
+                      boundary conditions.
+                      can also be a sequence, to write multiple boundaries to VTK file
+            ghost_layers: number of ghost layers to write, or True for all, False for none
        """
        if boundaries == 'all':
            boundaries = list(self._boundary_object_to_boundary_info.keys()) + ['domain']
@@ -272,7 +315,7 @@ class BoundaryHandling:
    def _create_boundary_kernel(self, symbolic_field, symbolic_index_field, boundary_obj):
        return create_boundary_kernel(symbolic_field, symbolic_index_field, self.stencil, boundary_obj,
-                                      target=self._target, openmp=self._openmp)
+                                      target=self._target, cpu_openmp=self._openmp)
    def _create_index_fields(self):
        dh = self._data_handling
@@ -299,7 +342,7 @@ class BoundaryHandling:
    def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
        if boundary_obj.additional_data_init_callback:
            boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target == 'gpu':
+        if self._target in self._data_handling._GPU_LIKE_TARGETS:
            self._data_handling.to_gpu(self._index_array_name)
    class BoundaryInfo(object):
@@ -309,7 +352,7 @@ class BoundaryHandling:
            self.kernel = kernel
    class IndexFieldBlockData:
-        def __init__(self, *_1, **_2):
+        def __init__(self, *args, **kwargs):
            self.boundary_object_to_index_list = {}
            self.boundary_object_to_data_setter = {}
@@ -317,24 +360,6 @@ class BoundaryHandling:
            self.boundary_object_to_index_list.clear()
            self.boundary_object_to_data_setter.clear()
-        @staticmethod
-        def to_cpu(gpu_version, cpu_version):
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                gpu_version[obj].get(cpu_arr)
-        @staticmethod
-        def to_gpu(gpu_version, cpu_version):
-            from pycuda import gpuarray
-            gpu_version = gpu_version.boundary_object_to_index_list
-            cpu_version = cpu_version.boundary_object_to_index_list
-            for obj, cpu_arr in cpu_version.items():
-                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = gpuarray.to_gpu(cpu_arr)
-                else:
-                    gpu_version[obj].set(cpu_arr)
 class BoundaryDataSetter:
@@ -356,26 +381,26 @@ class BoundaryDataSetter:
        assert coord < self.dim
        return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5
-    @memorycache()
+    @lru_cache()
    def link_offsets(self):
        return self.stencil[self.index_array['dir']]
-    @memorycache()
+    @lru_cache()
    def link_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]
-    @memorycache()
+    @lru_cache()
    def boundary_cell_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]
    def __setitem__(self, key, value):
        if key not in self.boundary_data_names:
-            raise KeyError("Invalid boundary data name %s. Allowed are %s" % (key, self.boundary_data_names))
+            raise KeyError(f"Invalid boundary data name {key}. Allowed are {self.boundary_data_names}")
        self.index_array[key] = value
    def __getitem__(self, item):
        if item not in self.boundary_data_names:
-            raise KeyError("Invalid boundary data name %s. Allowed are %s" % (item, self.boundary_data_names))
+            raise KeyError(f"Invalid boundary data name {item}. Allowed are {self.boundary_data_names}")
        return self.index_array[item]
@@ -401,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
        code = "\n"
        for i in range(dim):
            offset_str = ", ".join([str(d[i]) for d in stencil])
-            code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
+            code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
        inv_dirs = []
        for direction in stencil:
            inverse_dir = tuple([-i for i in direction])
            inv_dirs.append(str(stencil.index(inverse_dir)))
-        code += "const int %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
+        code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
        offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
        super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
                                                 symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))
    @staticmethod
    def _offset_symbols(dim):
-        return [TypedSymbol("c%s" % (d,), create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]]
+        return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]
-    INV_DIR_SYMBOL = TypedSymbol("invdir", "int")
+    INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)
-def create_boundary_kernel(field, index_field, stencil, boundary_functor, target='cpu', openmp=True):
+def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
    elements = [BoundaryOffsetInfo(stencil)]
-    index_arr_dtype = index_field.dtype.numpy_dtype
+    dir_symbol = TypedSymbol("dir", np.int32)
-    dir_symbol = TypedSymbol("dir", index_arr_dtype.fields['dir'][0])
+    elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
-    elements += [Assignment(dir_symbol, index_field[0]('dir'))]
    elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
-    return create_indexed_kernel(elements, [index_field], target=target, cpu_openmp=openmp)
+    config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
+                                **kernel_creation_args)
+    return create_kernel(elements, config=config)
--- a/pystencils/boundaries/createindexlist.py
+++ b/pystencils/boundaries/createindexlist.py
-import numpy as np
-import itertools
 import warnings
+import numpy as np
 try:
    import pyximport
    pyximport.install(language_level=3)
-    from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-        create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
    cython_funcs_available = True
-except Exception:
+except ImportError:
    cython_funcs_available = False
-    create_boundary_index_list_2d = None
-    create_boundary_index_list_3d = None
+if cython_funcs_available:
+    from pystencils.boundaries.createindexlistcython import (
+        create_boundary_neighbor_index_list_2d,
+        create_boundary_neighbor_index_list_3d,
+        create_boundary_cell_index_list_2d,
+        create_boundary_cell_index_list_3d,
+    )
 boundary_index_array_coordinate_names = ["x", "y", "z"]
 direction_member_name = "dir"
+default_index_array_dtype = np.int32
 def numpy_data_type_for_boundary_object(boundary_object, dim):
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    return np.dtype([(name, np.int32) for name in coordinate_names]
+    return np.dtype(
-                    + [(direction_member_name, np.int32)]
+        [(name, default_index_array_dtype) for name in coordinate_names]
-                    + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True)
+        + [(direction_member_name, default_index_array_dtype)]
+        + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
+        align=True,
-def _create_boundary_neighbor_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask,
+    )
-                                                fluid_mask, stencil, single_link):
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+def _create_index_list_python(
+    flag_field_arr,
-    result = []
+    boundary_mask,
-    gl = nr_of_ghost_layers
+    fluid_mask,
-    for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])):
+    stencil,
-        cell = cell[::-1]
+    single_link,
-        if not flag_field_arr[cell] & fluid_mask:
+    inner_or_boundary=False,
-            continue
+    nr_of_ghost_layers=None,
+):
+    if inner_or_boundary and nr_of_ghost_layers is None:
+        raise ValueError(
+            "If inner_or_boundary is set True the number of ghost layers "
+            "around the inner domain has to be specified"
+        )
+    if nr_of_ghost_layers is None:
+        nr_of_ghost_layers = 0
+    coordinate_names = boundary_index_array_coordinate_names[
+        : len(flag_field_arr.shape)
+    ]
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+    )
+    # boundary cells are extracted via np.nonzero. To ensure contiguous memory access in the compute kernel these
+    # cells have to be sorted.
+    boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
+    for i in range(len(flag_field_arr.shape)):
+        boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]
+    # First a set is created to save all fluid cells which are near boundary
+    fluid_cells = set()
+    for cell in boundary_cells:
+        cell = tuple(cell)
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            if flag_field_arr[neighbor_cell] & boundary_mask:
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
-                result.append(cell + (dir_idx,))
+            )
-                if single_link:
+            # prevent out of bounds access. If boundary cell is at the border, some stencil directions would be out.
-                    continue
+            if any(
+                not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
-    return np.array(result, dtype=index_arr_dtype)
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
+                continue
+            if flag_field_arr[neighbor_cell] & fluid_mask:
+                fluid_cells.add(neighbor_cell)
+    # then this set is transformed to a list to make it sortable. This ensures contiguous memory access later.
+    fluid_cells = list(fluid_cells)
+    if len(flag_field_arr.shape) == 3:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[-2], tup[0]))
+    else:
+        fluid_cells.sort(key=lambda tup: (tup[-1], tup[0]))
-def _create_boundary_cell_index_list_python(flag_field_arr, nr_of_ghost_layers, boundary_mask,
+    cells_to_iterate = fluid_cells if inner_or_boundary else boundary_cells
-                                            fluid_mask, stencil, single_link):
+    checkmask = boundary_mask if inner_or_boundary else fluid_mask
-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
    result = []
-    gl = nr_of_ghost_layers
+    for cell in cells_to_iterate:
-    for cell in itertools.product(*reversed([range(gl, i - gl) for i in flag_field_arr.shape])):
+        cell = tuple(cell)
-        cell = cell[::-1]
+        sum_cells = np.zeros(len(cell))
-        if not flag_field_arr[cell] & boundary_mask:
-            continue
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
-            neighbor_is_fluid = False
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
-            try:
+            )
-                neighbor_is_fluid = flag_field_arr[neighbor_cell] & fluid_mask
+            # prevent out of bounds access. If boundary cell is at the border, some stencil directions would be out.
-            except IndexError:
+            if any(
-                pass
+                not 0 <= e < upper
-            if neighbor_is_fluid:
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
-                result.append(cell + (dir_idx,))
+            ):
+                continue
+            if flag_field_arr[neighbor_cell] & checkmask:
                if single_link:
-                    continue
+                    sum_cells += np.array(direction)
+                else:
+                    result.append(tuple(cell) + (dir_idx,))
+        # the discrete normal direction is the one which gives the maximum inner product to the stencil direction
+        if single_link and any(sum_cells != 0):
+            idx = np.argmax(np.inner(sum_cells, stencil))
+            result.append(tuple(cell) + (idx,))
    return np.array(result, dtype=index_arr_dtype)
-def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
+def create_boundary_index_list(
-                               nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
    """Creates a numpy array storing links (connections) between domain cells and boundary cells.
    Args:
@@ -86,50 +141,79 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        nr_of_ghost_layers: only relevant if neighbors is True
        inner_or_boundary: if true, the result contains the cell coordinates of the domain cells -
                    if false the boundary cells are listed
-        single_link: if true only the first link is reported from this cell
+        single_link: if true only the link in normal direction to this cell is reported
    """
    dim = len(flag_field.shape)
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
-    stencil = np.array(stencil, dtype=np.int32)
+        + [(direction_member_name, default_index_array_dtype)]
-    args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link)
+    )
+    stencil = np.array(stencil, dtype=default_index_array_dtype)
+    args = (
+        flag_field,
+        nr_of_ghost_layers,
+        boundary_mask,
+        fluid_mask,
+        stencil,
+        single_link,
+    )
+    args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)
    if cython_funcs_available:
        if dim == 2:
            if inner_or_boundary:
                idx_list = create_boundary_neighbor_index_list_2d(*args)
            else:
-                idx_list = create_boundary_cell_index_list_2d(*args)
+                idx_list = create_boundary_cell_index_list_2d(*args_no_gl)
        elif dim == 3:
            if inner_or_boundary:
                idx_list = create_boundary_neighbor_index_list_3d(*args)
            else:
-                idx_list = create_boundary_cell_index_list_3d(*args)
+                idx_list = create_boundary_cell_index_list_3d(*args_no_gl)
        else:
            raise ValueError("Flag field has to be a 2 or 3 dimensional numpy array")
        return np.array(idx_list, dtype=index_arr_dtype)
    else:
        if flag_field.size > 1e6:
-            warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up")
+            warnings.warn(
-        if inner_or_boundary:
+                "Boundary setup may take very long! Consider installing cython to speed it up"
-            return _create_boundary_neighbor_index_list_python(*args)
+            )
-        else:
+        return _create_index_list_python(
-            return _create_boundary_cell_index_list_python(*args)
+            *args_no_gl,
+            inner_or_boundary=inner_or_boundary,
+            nr_of_ghost_layers=nr_of_ghost_layers,
-def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object,
+        )
-                                nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
-    idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
-                                           nr_of_ghost_layers, inner_or_boundary, single_link)
+def create_boundary_index_array(
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    boundary_object,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
+    idx_array = create_boundary_index_list(
+        flag_field,
+        stencil,
+        boundary_mask,
+        fluid_mask,
+        nr_of_ghost_layers,
+        inner_or_boundary,
+        single_link,
+    )
    dim = len(flag_field.shape)
    if boundary_object.additional_data:
        coordinate_names = boundary_index_array_coordinate_names[:dim]
        index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
        extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
-        for prop in coordinate_names + ['dir']:
+        for prop in coordinate_names + ["dir"]:
            extended_idx_field[prop] = idx_array[prop]
        idx_array = extended_idx_field

--- a/pystencils/boundaries/createindexlistcython.pyx
+++ b/pystencils/boundaries/createindexlistcython.pyx
-# Workaround for cython bug
+# cython: language_level=3str
-# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
-WORKAROUND = "Something"
 import cython
@@ -21,20 +19,37 @@ def create_boundary_neighbor_index_list_2d(object[IntegerType, ndim=2] flag_fiel
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
        for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & fluid_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if flag_field[x + dx, y + dy] & boundary_mask:
-                        boundary_index_list.append((x,y, dirIdx))
                        if single_link:
-                            break
+                            sum_x += dx; sum_y += dy;
+                        else:
+                            boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1];
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
@@ -46,6 +61,10 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -53,15 +72,27 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
    for z in range(nr_of_ghost_layers, zs - nr_of_ghost_layers):
        for y in range(nr_of_ghost_layers, ys - nr_of_ghost_layers):
            for x in range(nr_of_ghost_layers, xs - nr_of_ghost_layers):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & fluid_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]; dz = stencil[dirIdx,2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if flag_field[x + dx, y + dy, z + dz] & boundary_mask:
-                            boundary_index_list.append((x,y,z, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy; sum_z += dz
+                            else:
+                                boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx = 0
+                if single_link and (sum_x != 0 or sum_y != 0 or sum_z != 0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx * sum_x + dy * sum_y + dz * sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
@@ -69,37 +100,59 @@ def create_boundary_neighbor_index_list_3d(object[IntegerType, ndim=3] flag_fiel
 @cython.boundscheck(False) # turn off bounds-checking for entire function
 @cython.wraparound(False)  # turn off negative index wrapping for entire function
 def create_boundary_cell_index_list_2d(object[IntegerType, ndim=2] flag_field,
-                                       int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask,
+                                       IntegerType boundary_mask, IntegerType fluid_mask,
                                       object[int, ndim=2] stencil, int single_link):
    cdef int xs, ys, x, y
    cdef int dirIdx, num_directions, dx, dy
+    cdef int sum_x, sum_y
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
    for y in range(0, ys):
        for x in range(0, xs):
+            sum_x = 0; sum_y = 0;
            if flag_field[x, y] & boundary_mask:
                for dirIdx in range(num_directions):
-                    dx = stencil[dirIdx,0]
+                    dx = stencil[dirIdx,0]; dy = stencil[dirIdx,1]
-                    dy = stencil[dirIdx,1]
                    if 0 <= x + dx < xs and 0 <= y + dy < ys:
                        if flag_field[x + dx, y + dy] & fluid_mask:
-                            boundary_index_list.append((x,y, dirIdx))
                            if single_link:
-                                break
+                                sum_x += dx; sum_y += dy
+                            else:
+                                boundary_index_list.append((x, y, dirIdx))
+            dot = 0; maxn = 0; calculated_idx = 0
+            if single_link and (sum_x != 0 or sum_y != 0):
+                for dirIdx in range(num_directions):
+                    dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]
+                    dot = dx * sum_x + dy * sum_y
+                    if dot > maxn:
+                        maxn = dot
+                        calculated_idx = dirIdx
+                boundary_index_list.append((x, y, calculated_idx))
    return boundary_index_list
 @cython.boundscheck(False) # turn off bounds-checking for entire function
 @cython.wraparound(False)  # turn off negative index wrapping for entire function
 def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
-                                       int nr_of_ghost_layers, IntegerType boundary_mask, IntegerType fluid_mask,
+                                       IntegerType boundary_mask, IntegerType fluid_mask,
                                       object[int, ndim=2] stencil, int single_link):
    cdef int xs, ys, zs, x, y, z
    cdef int dirIdx, num_directions, dx, dy, dz
+    cdef int sum_x, sum_y, sum_z
+    cdef float dot, maxn
+    cdef int calculated_idx
    xs, ys, zs = flag_field.shape
    boundary_index_list = []
    num_directions = stencil.shape[0]
@@ -107,14 +160,27 @@ def create_boundary_cell_index_list_3d(object[IntegerType, ndim=3] flag_field,
    for z in range(0, zs):
        for y in range(0, ys):
            for x in range(0, xs):
+                sum_x = 0; sum_y = 0; sum_z = 0
                if flag_field[x, y, z] & boundary_mask:
                    for dirIdx in range(num_directions):
-                        dx = stencil[dirIdx,0]
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
-                        dy = stencil[dirIdx,1]
-                        dz = stencil[dirIdx,2]
                        if 0 <= x + dx < xs and 0 <= y + dy < ys and 0 <= z + dz < zs:
                            if flag_field[x + dx, y + dy, z + dz] & fluid_mask:
-                                boundary_index_list.append((x,y,z, dirIdx))
                                if single_link:
-                                    break
+                                    sum_x += dx; sum_y += dy; sum_z += dz
+                                else:
+                                    boundary_index_list.append((x, y, z, dirIdx))
+                dot = 0; maxn = 0; calculated_idx=0
+                if single_link and (sum_x != 0 or sum_y !=0 or sum_z !=0):
+                    for dirIdx in range(num_directions):
+                        dx = stencil[dirIdx, 0]; dy = stencil[dirIdx, 1]; dz = stencil[dirIdx, 2]
+                        dot = dx*sum_x + dy*sum_y + dz*sum_z
+                        if dot > maxn:
+                            maxn = dot
+                            calculated_idx = dirIdx
+                    boundary_index_list.append((x, y, z, calculated_idx))
    return boundary_index_list
\ No newline at end of file
--- a/pystencils/boundaries/inkernel.py
+++ b/pystencils/boundaries/inkernel.py
 import sympy as sp
-from pystencils import Field, TypedSymbol
-from pystencils.integer_functions import bitwise_and
 from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
-from pystencils.data_types import create_type
+from pystencils.typing import TypedSymbol, create_type
+from pystencils.field import Field
+from pystencils.integer_functions import bitwise_and
 def add_neumann_boundary(eqs, fields, flag_field, boundary_flag="neumann_flag", inverse_flag=False):
    """
    Replaces all neighbor accesses by flag field guarded accesses.
    If flag in neighboring cell is set, the center value is used instead
-    :param eqs: list of equations containing field accesses to direct neighbors
-    :param fields: fields for which the Neumann boundary should be applied
+    Args:
-    :param flag_field: integer field marking boundary cells
+        eqs: list of equations containing field accesses to direct neighbors
-    :param boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet)
+        fields: fields for which the Neumann boundary should be applied
-                          the cell is assumed to be boundary
+        flag_field: integer field marking boundary cells
-    :param inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag
+        boundary_flag: if flag field has value 'boundary_flag' (no bit operations yet)
-    :return: list of equations with guarded field accesses
+                       the cell is assumed to be boundary
+        inverse_flag: if true, boundary cells are where flag field has not the value of boundary_flag
+    Returns:
+        list of equations with guarded field accesses
    """
    if not hasattr(fields, "__len__"):
        fields = [fields]

--- a/src/pystencils/cache.py
+++ b/src/pystencils/cache.py
+import os
+from collections.abc import Hashable
+from functools import partial, wraps
+from itertools import chain
+from functools import lru_cache as memorycache
+from joblib import Memory
+from appdirs import user_cache_dir
+if 'PYSTENCILS_CACHE_DIR' in os.environ:
+    cache_dir = os.environ['PYSTENCILS_CACHE_DIR']
+else:
+    cache_dir = user_cache_dir('pystencils')
+disk_cache = Memory(cache_dir, verbose=False).cache
+disk_cache_no_fallback = disk_cache
+def _wrapper(wrapped_func, cached_func, *args, **kwargs):
+    if all(isinstance(a, Hashable) for a in chain(args, kwargs.values())):
+        return cached_func(*args, **kwargs)
+    else:
+        return wrapped_func(*args, **kwargs)
+def memorycache_if_hashable(maxsize=128, typed=False):
+    def wrapper(func):
+        return partial(_wrapper, func, memorycache(maxsize, typed)(func))
+    return wrapper
+def sharedmethodcache(cache_id: str):
+    """Decorator for memoization of instance methods, allowing multiple methods to use the same cache.
+    This decorator caches results of instance methods per instantiated object of the surrounding class.
+    It allows multiple methods to use the same cache, by passing them the same `cache_id` string.
+    Cached values are stored in a dictionary, which is added as a member `self.<cache_id>` to the 
+    `self` object instance. Make sure that this doesn't cause any naming conflicts with other members!
+    Of course, for this to be useful, said methods must have the same signature (up to additional kwargs)
+    and must return the same result when called with the same arguments."""
+    def _decorator(user_method):
+        @wraps(user_method)
+        def _decorated_func(self, *args, **kwargs):
+            objdict = self.__dict__
+            cache = objdict.setdefault(cache_id, dict())
+            key = args
+            for item in kwargs.items():
+                key += item
+            if key not in cache:
+                result = user_method(self, *args, **kwargs)
+                cache[key] = result
+                return result
+            else:
+                return cache[key]
+        return _decorated_func
+    return _decorator
+def clear_cache():
+    """
+    Clears the pystencils cache created by joblib.
+    """
+    memory = Memory(cache_dir, verbose=0)
+    memory.clear(warn=False)
+# To disable the joblib-backed disk cache, uncomment the following lines:
+# disk_cache = lambda o: o
+# disk_cache_no_fallback = lambda o: o
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
+from copy import copy
+from collections import defaultdict
+from dataclasses import dataclass, field
+from types import MappingProxyType
+from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
+from pystencils import Target, Backend, Field
+from pystencils.typing.typed_sympy import BasicType
+from pystencils.typing.utilities import collate_types
+import numpy as np
+# TODO: NumPy provides DTypeLike, which would be better than `type` for type hinting, but it is too new at the moment
+# from numpy.typing import DTypeLike
+# TODO: CreateKernelConfig is bloated; think about splitting it into more classes, using a factory, or similar ...
+# Proposal: separate CreateKernelConfig classes for different targets?
+@dataclass
+class CreateKernelConfig:
+    """
+    **Below all parameters for the CreateKernelConfig are explained**
+    """
+    target: Target = Target.CPU
+    """
+    All targets are defined in :class:`pystencils.enums.Target`
+    """
+    backend: Backend = None
+    """
+    All backends are defined in :class:`pystencils.enums.Backend`
+    """
+    function_name: str = 'kernel'
+    """
+    Name of the generated function - only important if generated code is written out
+    """
+    data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
+    """
+    Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type.
+    If specified as a dict ideally a defaultdict is used to define a default value for symbols not listed in the
+    dict. If a plain dict is provided it will be transformed into a defaultdict internally. The default value
+    will then be determined via type collation.
+    """
+    default_number_float: Union[type, str, BasicType] = None
+    """
+    Data type used for all untyped floating point numbers (i.e. 0.5). By default the value of data_type is used.
+    If data_type is given as a defaultdict its default_factory is used.
+    """
+    default_number_int: Union[type, str, BasicType] = np.int64
+    """
+    Data type used for all untyped integer numbers (i.e. 1)
+    """
+    iteration_slice: Tuple = None
+    """
+    Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
+    """
+    ghost_layers: Union[bool, int, List[Tuple[int]]] = None
+    """
+    A single integer specifies the ghost layer count at all borders, can also be a sequence of
+    pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
+    If left to default, the number of ghost layers is determined automatically from the assignments.
+    """
+    cpu_openmp: Union[bool, int] = False
+    """
+    `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
+    of available threads will be chosen.
+    """
+    cpu_vectorize_info: Dict = None
+    """
+    A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
+    for documentation of these parameters see vectorize function. Example:
+    '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
+    """
+    cpu_blocking: Tuple[int] = None
+    """
+    A tuple of block sizes or `None` if no blocking should be applied
+    """
+    omp_single_loop: bool = True
+    """
+    If OpenMP is active: whether multiple outer loops are permitted
+    """
+    base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
+    """
+    Specification of how many and which intermediate pointers are created for a field access.
+    For example [ (0), (2,3,)]  creates one base pointer for coordinates 2 and 3 and writes the offset for coordinate
+    zero directly in the field access. These specifications are defined dependent on the loop ordering.
+    This function translates more readable version into the specification above.
+    For more information see: `pystencils.transformations.create_intermediate_base_pointer`
+    """
+    gpu_indexing: str = 'block'
+    """
+    Either 'block' or 'line', or a custom indexing class, see `pystencils.gpu.AbstractIndexing`
+    """
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
+    """
+    Dict with indexing parameters (constructor parameters of indexing class)
+    e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
+    """
+    # TODO Markus rework this docstring
+    default_assignment_simplifications: bool = False
+    """
+    If `True` default simplifications are first performed on the Assignments. If problems occur during the
+    simplification a warning will be thrown.
+    Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
+    on the level of the `pystencils.AssignmentCollection`.  In this part,
+    `pystencils.simp.create_simplification_strategy` from pystencils.simplificationfactory will be used to
+    apply optimisations like insertion of constants to
+    remove pressure from the registers. Thus the first part of the optimisations can only be executed if
+    an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
+    individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
+    to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
+    """
+    cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
+    """
+    List of extra optimizations to perform first on the AST.
+    """
+    use_auto_for_assignments: bool = False
+    """
+    If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
+    """
+    index_fields: List[Field] = None
+    """
+    List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
+    instead of `create_domain_kernel` is used.
+    """
+    coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
+    """
+    Name of the coordinate fields in the struct data type.
+    """
+    allow_double_writes: bool = False
+    """
+    If True, don't check if every field is only written at a single location. This is required
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
+    cells at once. Use with care!
+    """
+    skip_independence_check: bool = False
+    """
+    By default the assignment list is checked for read/write independence. This means fields are only written at
+    locations where they are read. Doing so guarantees thread safety. In some cases, e.g. for
+    periodicity kernels, this cannot be assured and thus the check needs to be deactivated. Use with care!
+    """
+    class DataTypeFactory:
+        """Because of pickle, we need to have a nested class, instead of a lambda in __post_init__"""
+        def __init__(self, dt):
+            self.dt = dt
+        def __call__(self):
+            return BasicType(self.dt)
+    def _check_type(self, dtype_to_check):
+        if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
+            self._typing_error()
+        if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
+            # NumPy-types are also of type 'type'. However, they have more properties
+            self._typing_error()
+    @staticmethod
+    def _typing_error():
+        raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
+                         "types are ambiguous. For example float will map to double. "
+                         "Also the string version like 'float' is not allowed, e.g. use 'float64' instead")
+    def __post_init__(self):
+        # ----  Legacy parameters
+        if not isinstance(self.target, Target):
+            raise ValueError("target must be provided by the 'Target' enum")
+        # ---- Auto Backend
+        if not self.backend:
+            if self.target == Target.CPU:
+                self.backend = Backend.C
+            elif self.target == Target.GPU:
+                self.backend = Backend.CUDA
+            else:
+                raise NotImplementedError(f'Target {self.target} has no default backend')
+        if not isinstance(self.backend, Backend):
+            raise ValueError("backend must be provided by the 'Backend' enum")
+        # Normalise data types
+        for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
+            self._check_type(dtype)
+        if not isinstance(self.data_type, dict):
+            dt = copy(self.data_type)  # The copy is necessary because BasicType has sympy shinanigans
+            self.data_type = defaultdict(self.DataTypeFactory(dt))
+        if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
+            for dtype in self.data_type.values():
+                self._check_type(dtype)
+            dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
+            dtype_dict = self.data_type
+            self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)
+        assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
+        for dtype in self.data_type.values():
+            self._check_type(dtype)
+        self._check_type(self.data_type.default_factory())
+        if self.default_number_float is None:
+            self.default_number_float = self.data_type.default_factory()
+        if not isinstance(self.default_number_float, BasicType):
+            self.default_number_float = BasicType(self.default_number_float)
+        if not isinstance(self.default_number_int, BasicType):
+            self.default_number_int = BasicType(self.default_number_int)
--- a/pystencils/cpu/__init__.py
+++ b/pystencils/cpu/__init__.py
-from pystencils.cpu.kernelcreation import create_kernel, create_indexed_kernel, add_openmp
 from pystencils.cpu.cpujit import make_python_function
+from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas
-__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'make_python_function']
+__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -13,7 +13,7 @@ in a configuration file.
 3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
   ``%HOMEPATH%\.pystencils\config.json`` (Windows)
-If no configuration file is found, a default configuration is created at the above mentioned location in your home.
+If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
 So run *pystencils* once, then edit the created configuration file.
@@ -23,7 +23,7 @@ Compiler Config (Linux)
 - **'os'**: should be detected automatically as 'linux'
 - **'command'**: path to C++ compiler (defaults to 'g++')
 - **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the restrict qualifier is not standardized accross compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For most Linux compilers the qualifier is ``__restrict__``
@@ -39,32 +39,40 @@ Then 'cl.exe' is used to compile.
  where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
 - **'arch'**: 'x86' or 'x64'
 - **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the restrict qualifier is not standardized across compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For Windows compilers the qualifier should be ``__restrict``
 """
-import os
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
+import os
 import platform
 import shutil
+import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from tempfile import TemporaryDirectory
+import time
+import warnings
+import pathlib
 import numpy as np
-import subprocess
-from appdirs import user_config_dir, user_cache_dir
-from collections import OrderedDict
-from pystencils.utils import recursive_dict_update
-from sysconfig import get_paths
 from pystencils import FieldType
+from pystencils.astnodes import LoopOverCoordinate
 from pystencils.backends.cbackend import generate_c, get_headers
-from pystencils.utils import file_handle_for_atomic_write, atomic_file_write
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
+from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
+from pystencils.utils import atomic_file_write, recursive_dict_update
-def make_python_function(kernel_function_node):
+def make_python_function(kernel_function_node, custom_backend=None):
    """
    Creates C code from the abstract syntax tree, compiles it and makes it accessible as Python function
@@ -73,9 +81,10 @@ def make_python_function(kernel_function_node):
        - all symbols which are not defined in the kernel itself are expected as parameters
    :param kernel_function_node: the abstract syntax tree
+    :param custom_backend: use own custom printer for code generation
    :return: kernel functor
    """
-    result = compile_and_load(kernel_function_node)
+    result = compile_and_load(kernel_function_node, custom_backend)
    return result
@@ -115,15 +124,15 @@ def get_configuration_file_path():
    # 1) Read path from environment variable if found
    if 'PYSTENCILS_CONFIG' in os.environ:
-        return os.environ['PYSTENCILS_CONFIG'], True
+        return os.environ['PYSTENCILS_CONFIG']
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
-        return "pystencils.json", True
+        return "pystencils.json"
    # 3) Try ~/.pystencils.json
    elif os.path.exists(config_path_in_home):
-        return config_path_in_home, True
+        return config_path_in_home
    else:
-        return config_path_in_home, False
+        return config_path_in_home
 def create_folder(path, is_file):
@@ -143,15 +152,42 @@ def read_config():
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
+                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
        default_compiler_config = OrderedDict([
            ('os', 'windows'),
            ('msvc_version', 'latest'),
            ('arch', 'x64'),
-            ('flags', '/Ox /fp:fast /openmp /arch:avx'),
+            ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
            ('restrict_qualifier', '__restrict')
        ])
+        if platform.machine() == 'ARM64':
+            default_compiler_config['arch'] = 'ARM64'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
+    elif platform.system().lower() == 'darwin':
+        default_compiler_config = OrderedDict([
+            ('os', 'darwin'),
+            ('command', 'clang++'),
+            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -Xclang -fopenmp -std=c++11'),
+            ('restrict_qualifier', '__restrict__')
+        ])
+        if platform.machine() == 'arm64':
+            if 'sme' in get_supported_instruction_sets():
+                flag = '-march=armv8.7-a+sme '
+            else:
+                flag = ''
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
+        for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
+                       '/opt/homebrew/lib/libomp.dylib']:
+            if os.path.exists(libomp):
+                default_compiler_config['flags'] += ' ' + libomp
+                break
+    else:
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
+                                  (platform.system(),))
    default_cache_config = OrderedDict([
        ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
        ('clear_cache_on_start', False),
@@ -160,26 +196,47 @@ def read_config():
    default_config = OrderedDict([('compiler', default_compiler_config),
                                  ('cache', default_cache_config)])
-    config_path, config_exists = get_configuration_file_path()
+    from fasteners import InterProcessLock
+    config_path = pathlib.Path(get_configuration_file_path())
+    config_path.parent.mkdir(parents=True, exist_ok=True)
    config = default_config.copy()
-    if config_exists:
-        with open(config_path, 'r') as json_config_file:
+    lockfile = config_path.with_suffix(config_path.suffix + ".lock")
-            loaded_config = json.load(json_config_file)
+    with InterProcessLock(lockfile):
-        config = recursive_dict_update(config, loaded_config)
+        if config_path.exists():
-    else:
+            with open(config_path, 'r') as json_config_file:
-        create_folder(config_path, True)
+                loaded_config = json.load(json_config_file)
-        json.dump(config, open(config_path, 'w'), indent=4)
+            config = recursive_dict_update(config, loaded_config)
+        else:
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=4)
    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
-        if config['cache']['clear_cache_on_start']:
+        clear_cache_on_start = False
-            clear_cache()
+        cache_status_file = os.path.join(config['cache']['object_cache'], 'last_config.json')
+        if os.path.exists(cache_status_file):
+            # check if compiler config has changed
+            last_config = json.load(open(cache_status_file, 'r'))
+            if set(last_config.items()) != set(config['compiler'].items()):
+                clear_cache_on_start = True
+            else:
+                for key in last_config.keys():
+                    if last_config[key] != config['compiler'][key]:
+                        clear_cache_on_start = True
+        if config['cache']['clear_cache_on_start'] or clear_cache_on_start:
+            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)
        create_folder(config['cache']['object_cache'], False)
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+            json.dump(config['compiler'], f, indent=4)
+        os.replace(f.name, cache_status_file)
    if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -226,6 +283,7 @@ def clear_cache():
        create_folder(cache_config['object_cache'], False)
+# TODO: don't hardcode the C type (element [1] of each tuple below)
 type_mapping = {
    np.float32: ('PyFloat_AsDouble', 'float'),
    np.float64: ('PyFloat_AsDouble', 'double'),
@@ -237,7 +295,6 @@ type_mapping = {
    np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
 }
 template_extract_scalar = """
 PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
 if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
@@ -312,15 +369,14 @@ def equal_size_check(fields):
        return ""
    ref_field = fields[0]
-    cond = ["(buffer_{field.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])".format(ref_field=ref_field,
+    cond = [f"(buffer_{field_to_test.name}.shape[{i}] == buffer_{ref_field.name}.shape[{i}])"
-                                                                                            field=field_to_test, i=i)
            for field_to_test in fields[1:]
            for i in range(fields[0].spatial_dimensions)]
    cond = " && ".join(cond)
    return template_size_check.format(cond=cond)
-def create_function_boilerplate_code(parameter_info, name, insert_checks=True):
+def create_function_boilerplate_code(parameter_info, name, ast_node, insert_checks=True):
    pre_call_code = ""
    parameters = []
    post_call_code = ""
@@ -332,24 +388,55 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True):
            field = param.fields[0]
            pre_call_code += template_extract_array.format(name=field.name)
            post_call_code += template_release_buffer.format(name=field.name)
-            parameters.append("({dtype} *)buffer_{name}.buf".format(dtype=str(field.dtype), name=field.name))
+            parameters.append(f"({str(field.dtype)} *)buffer_{field.name}.buf")
            if insert_checks:
                np_dtype = field.dtype.numpy_dtype
                item_size = np_dtype.itemsize
-                if np_dtype.isbuiltin and FieldType.is_generic(field):
+                aligned = False
-                    dtype_cond = "buffer_{name}.format[0] == '{format}'".format(name=field.name,
+                if ast_node.assignments:
-                                                                                format=field.dtype.numpy_dtype.char)
+                    aligned = any([a.lhs.args[2] for a in ast_node.assignments
+                                   if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
+                                   and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])
+                if ast_node.instruction_set and aligned:
+                    byte_width = ast_node.instruction_set['width'] * item_size
+                    if 'cachelineZero' in ast_node.instruction_set:
+                        has_openmp, has_nontemporal = False, False
+                        for loop in ast_node.atoms(LoopOverCoordinate):
+                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
+                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
+                                                                      loop.atoms(VectorMemoryAccess)])
+                        if has_openmp and has_nontemporal:
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
+                    offset = max(max(ast_node.ghost_layers)) * item_size
+                    offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"
+                    message = str(offset) + ". This is probably due to a different number of ghost_layers chosen for " \
+                                            "the arrays and the kernel creation. If the number of ghost layers for " \
+                                            "the kernel creation is not specified it will choose a suitable value " \
+                                            "automatically. This value might not " \
+                                            "be compatible with the allocated arrays."
+                    if type(byte_width) is not int:
+                        message += " Note that when both OpenMP and non-temporal stores are enabled, alignment to the "\
+                                   "cacheline size is required."
+                    pre_call_code += template_check_array.format(cond=offset_cond, what="offset", name=field.name,
+                                                                 expected=message)
+                if (np_dtype.isbuiltin and FieldType.is_generic(field)
+                        and not np.issubdtype(field.dtype.numpy_dtype, np.complexfloating)):
+                    dtype_cond = f"buffer_{field.name}.format[0] == '{field.dtype.numpy_dtype.char}'"
                    pre_call_code += template_check_array.format(cond=dtype_cond, what="data type", name=field.name,
                                                                 expected=str(field.dtype.numpy_dtype))
-                item_size_cond = "buffer_{name}.itemsize == {size}".format(name=field.name, size=item_size)
+                item_size_cond = f"buffer_{field.name}.itemsize == {item_size}"
                pre_call_code += template_check_array.format(cond=item_size_cond, what="itemsize", name=field.name,
                                                             expected=item_size)
                if field.has_fixed_shape:
-                    shape_cond = ["buffer_{name}.shape[{i}] == {s}".format(s=s, name=field.name, i=i)
+                    shape_cond = [f"buffer_{field.name}.shape[{i}] == {s}"
                                  for i, s in enumerate(field.spatial_shape)]
                    shape_cond = " && ".join(shape_cond)
                    pre_call_code += template_check_array.format(cond=shape_cond, what="shape", name=field.name,
@@ -369,14 +456,15 @@ def create_function_boilerplate_code(parameter_info, name, insert_checks=True):
        elif param.is_field_stride:
            field = param.fields[0]
            item_size = field.dtype.numpy_dtype.itemsize
-            parameters.append("buffer_{name}.strides[{i}] / {bytes}".format(bytes=item_size, i=param.symbol.coordinate,
+            parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
-                                                                            name=field.name))
        elif param.is_field_shape:
-            parameters.append("buffer_{name}.shape[{i}]".format(i=param.symbol.coordinate, name=param.field_name))
+            parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
        else:
            extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
-            pre_call_code += template_extract_scalar.format(extract_function=extract_function, target_type=target_type,
+            pre_call_code += template_extract_scalar.format(extract_function=extract_function,
+                                                            target_type=target_type,
                                                            name=param.symbol.name)
            parameters.append(param.symbol.name)
    pre_call_code += equal_size_check(variable_sized_normal_fields)
@@ -395,10 +483,17 @@ def create_module_boilerplate_code(module_name, names):
 def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
+    try:
-    spec = spec_from_file_location(name=module_name, location=path)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
-    mod = module_from_spec(spec)
+        mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
+        spec.loader.exec_module(mod)
+    except ImportError:
+        warnings.warn(f"Could not load {path}, trying one more time in 5 seconds ...")
+        time.sleep(5)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
    return getattr(mod, function_name)
@@ -407,7 +502,6 @@ def run_compile_step(command):
    config_env = compiler_config['env'] if 'env' in compiler_config else {}
    compile_environment = os.environ.copy()
    compile_environment.update(config_env)
    try:
        shell = True if compiler_config['os'].lower() == 'windows' else False
        subprocess.check_output(command, env=compile_environment, stderr=subprocess.STDOUT, shell=shell)
@@ -418,61 +512,74 @@ def run_compile_step(command):
 class ExtensionModuleCode:
-    def __init__(self, module_name='generated'):
+    def __init__(self, module_name='generated', custom_backend=None):
        self.module_name = module_name
        self._ast_nodes = []
        self._function_names = []
+        self._custom_backend = custom_backend
+        self._code_string = str()
+        self._code_hash = None
    def add_function(self, ast, name=None):
        self._ast_nodes.append(ast)
        self._function_names.append(name if name is not None else ast.function_name)
-    def write_to_file(self, restrict_qualifier, function_prefix, file):
+    def create_code_string(self, restrict_qualifier, function_prefix):
+        self._code_string = str()
        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')
+        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
+                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
+        header_hash = b''.join([hashlib.sha256(open(h, 'rb').read()).digest() for h in ps_headers])
-        includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list])
+        includes = "\n".join([f"#include {include_file}" for include_file in header_list])
-        print(includes, file=file)
+        self._code_string += includes
-        print("\n", file=file)
+        self._code_string += "\n"
-        print("#define RESTRICT %s" % (restrict_qualifier,), file=file)
+        self._code_string += f"#define RESTRICT {restrict_qualifier} \n"
-        print("#define FUNC_PREFIX %s" % (function_prefix,), file=file)
+        self._code_string += f"#define FUNC_PREFIX {function_prefix}"
-        print("\n", file=file)
+        self._code_string += "\n"
        for ast, name in zip(self._ast_nodes, self._function_names):
            old_name = ast.function_name
-            ast.function_name = "kernel_" + name
+            ast.function_name = f"kernel_{name}"
-            print(generate_c(ast), file=file)
+            self._code_string += generate_c(ast, custom_backend=self._custom_backend)
-            print(create_function_boilerplate_code(ast.get_parameters(), name), file=file)
+            self._code_string += create_function_boilerplate_code(ast.get_parameters(), name, ast)
            ast.function_name = old_name
-        print(create_module_boilerplate_code(self.module_name, self._function_names), file=file)
+        self._code_hash = "mod_" + hashlib.sha256(self._code_string.encode() + header_hash).hexdigest()
+        self._code_string += create_module_boilerplate_code(self._code_hash, self._function_names)
+    def get_hash_of_code(self):
+        assert self._code_string, "The code must be generated first"
+        return self._code_hash
-class KernelWrapper:
+    def write_to_file(self, file):
-    def __init__(self, kernel, parameters, ast_node):
+        assert self._code_string, "The code must be generated first"
-        self.kernel = kernel
+        print(self._code_string, file=file)
-        self.parameters = parameters
-        self.ast = ast_node
-    def __call__(self, **kwargs):
-        return self.kernel(**kwargs)
+def compile_module(code, code_hash, base_dir, compile_flags=None):
+    if compile_flags is None:
+        compile_flags = []
-def compile_module(code, code_hash, base_dir):
    compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()]
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
    if compiler_config['os'].lower() == 'windows':
-        function_prefix = '__declspec(dllexport)'
        lib_suffix = '.pyd'
        object_suffix = '.obj'
        windows = True
    else:
-        function_prefix = ''
        lib_suffix = '.so'
        object_suffix = '.o'
        windows = False
@@ -482,8 +589,11 @@ def compile_module(code, code_hash, base_dir):
    object_file = os.path.join(base_dir, code_hash + object_suffix)
    if not os.path.exists(object_file):
-        with file_handle_for_atomic_write(src_file) as f:
+        try:
-            code.write_to_file(compiler_config['restrict_qualifier'], function_prefix, f)
+            with open(src_file, 'x') as f:
+                code.write_to_file(f)
+        except FileExistsError:
+            pass
        if windows:
            compile_cmd = ['cl.exe', '/c', '/EHsc'] + compiler_config['flags'].split()
@@ -497,11 +607,15 @@ def compile_module(code, code_hash, base_dir):
        # Linking
        if windows:
-            import sysconfig
            config_vars = sysconfig.get_config_vars()
            py_lib = os.path.join(config_vars["installed_base"], "libs",
-                                  "python{}.lib".format(config_vars["py_version_nodot"]))
+                                  f"python{config_vars['py_version_nodot']}.lib")
            run_compile_step(['link.exe', py_lib, '/DLL', '/out:' + lib_file, object_file])
+        elif platform.system().lower() == 'darwin':
+            with atomic_file_write(lib_file) as file_name:
+                run_compile_step([compiler_config['command'], '-shared', object_file, '-o', file_name, '-undefined',
+                                  'dynamic_lookup']
+                                 + compiler_config['flags'].split())
        else:
            with atomic_file_write(lib_file) as file_name:
                run_compile_step([compiler_config['command'], '-shared', object_file, '-o', file_name]
@@ -509,18 +623,34 @@ def compile_module(code, code_hash, base_dir):
    return lib_file
-def compile_and_load(ast):
+def compile_and_load(ast, custom_backend=None):
    cache_config = get_cache_config()
-    code_hash_str = "mod_" + hashlib.sha256(generate_c(ast, dialect='c').encode()).hexdigest()
-    code = ExtensionModuleCode(module_name=code_hash_str)
+    compiler_config = get_compiler_config()
+    if compiler_config['os'].lower() == 'windows':
+        function_prefix = '__declspec(dllexport)'
+    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
+        function_prefix = ast.instruction_set['function_prefix']
+    else:
+        function_prefix = ''
+    code = ExtensionModuleCode(custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)
+    code.create_code_string(compiler_config['restrict_qualifier'], function_prefix)
+    code_hash_str = code.get_hash_of_code()
+    compile_flags = []
+    if ast.instruction_set and 'compile_flags' in ast.instruction_set:
+        compile_flags = ast.instruction_set['compile_flags']
    if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
-            lib_file = compile_module(code, code_hash_str, base_dir)
+            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:
-        lib_file = compile_module(code, code_hash_str, base_dir=cache_config['object_cache'])
+        lib_file = compile_module(code, code_hash_str, base_dir=cache_config['object_cache'],
+                                  compile_flags=compile_flags)
        result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    return KernelWrapper(result, ast.get_parameters(), ast)
--- a/pystencils/cpu/kernelcreation.py
+++ b/pystencils/cpu/kernelcreation.py
 import sympy as sp
-from functools import partial
-from pystencils.astnodes import SympyAssignment, Block, LoopOverCoordinate, KernelFunction
-from pystencils.transformations import resolve_buffer_accesses, resolve_field_accesses, make_loop_over_domain, \
-    add_types, get_optimal_loop_ordering, parse_base_pointer_info, move_constants_before_loop, \
-    split_inner_loop, get_base_buffer_index
-from pystencils.data_types import TypedSymbol, BasicType, StructType, create_type
-from pystencils.field import Field, FieldType
 import pystencils.astnodes as ast
+from pystencils.config import CreateKernelConfig
+from pystencils.enums import Target, Backend
+from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.cpu.cpujit import make_python_function
-from pystencils.assignment import Assignment
+from pystencils.typing import StructType, TypedSymbol, create_type
-from typing import List, Union
+from pystencils.typing.transformations import add_types
+from pystencils.field import Field, FieldType
-AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]]
+from pystencils.node_collection import NodeCollection
+from pystencils.transformations import (
+    filtered_tree_iteration, iterate_loops_by_depth, get_base_buffer_index, get_optimal_loop_ordering,
+    make_loop_over_domain, add_outer_loop_over_indexed_elements,
+    move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
+    resolve_field_accesses, split_inner_loop)
-def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "kernel", type_info='double',
+def create_kernel(assignments: NodeCollection,
-                  split_groups=(), iteration_slice=None, ghost_layers=None,
+                  config: CreateKernelConfig) -> KernelFunction:
-                  skip_independence_check=False) -> KernelFunction:
    """Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
    Loops are created according to the field accesses in the equations.
@@ -24,35 +25,25 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
    Args:
        assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
        Defining the update rules of the kernel
-        function_name: name of the generated function - only important if generated code is written out
+        config: kernel creation settings, see :class:`pystencils.config.CreateKernelConfig`
-        type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
-                   be of type 'double' except symbols which occur on the left hand side of equations where the
-                   right hand side is a sympy Boolean which are assumed to be 'bool' .
-        split_groups: Specification on how to split up inner loop into multiple loops. For details see
-                      transformation :func:`pystencils.transformation.split_inner_loop`
-        iteration_slice: if not None, iteration is done only over this slice of the field
-        ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
-                     if None, the number of ghost layers is determined automatically and assumed to be equal for a
-                     all dimensions
-        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
-                                 periodicity kernel, that access the field outside the iteration bounds. Use with care!
    Returns:
        AST node representing a function, that can be printed as C or CUDA code
    """
+    function_name = config.function_name
+    iteration_slice = config.iteration_slice
+    ghost_layers = config.ghost_layers
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
-    def type_symbol(term):
+    split_groups = ()
-        if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
+    if 'split_groups' in assignments.simplification_hints:
-            return term
+        split_groups = assignments.simplification_hints['split_groups']
-        elif isinstance(term, sp.Symbol):
+    assignments = assignments.all_assignments
-            if not hasattr(type_info, '__getitem__'):
-                return TypedSymbol(term.name, create_type(type_info))
+    # TODO Cleanup: move add_types to create_domain_kernel or create_kernel
-            else:
+    assignments = add_types(assignments, config)
-                return TypedSymbol(term.name, type_info[term.name])
-        else:
-            raise ValueError("Term has to be field access or symbol")
-    fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])
@@ -61,15 +52,33 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
    body = ast.Block(assignments)
    loop_order = get_optimal_loop_ordering(fields_without_buffers)
-    ast_node = make_loop_over_domain(body, function_name, iteration_slice=iteration_slice,
+    loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice,
-                                     ghost_layers=ghost_layers, loop_order=loop_order)
+                                                        ghost_layers=ghost_layers, loop_order=loop_order)
-    ast_node.target = 'cpu'
+    loop_node = add_outer_loop_over_indexed_elements(loop_node)
+    ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
+                              ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)
    if split_groups:
+        type_info = config.data_type
+        def type_symbol(term):
+            if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
+                return term
+            elif isinstance(term, sp.Symbol):
+                if isinstance(type_info, str) or not hasattr(type_info, '__getitem__'):
+                    return TypedSymbol(term.name, create_type(type_info))
+                else:
+                    return TypedSymbol(term.name, type_info[term.name])
+            else:
+                raise ValueError("Term has to be field access or symbol")
        typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
        split_inner_loop(ast_node, typed_split_groups)
-    base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']]
+    base_pointer_spec = config.base_pointer_specification
+    if base_pointer_spec is None:
+        base_pointer_spec = []
    base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
                                                             field.spatial_dimensions, field.index_dimensions)
                         for field in fields_without_buffers}
@@ -81,14 +90,14 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
    if any(FieldType.is_buffer(f) for f in all_fields):
        resolve_buffer_accesses(ast_node, get_base_buffer_index(ast_node), read_only_fields)
+    # TODO think about typing
    resolve_field_accesses(ast_node, read_only_fields, field_to_base_pointer_info=base_pointer_info)
    move_constants_before_loop(ast_node)
-    ast_node.compile = partial(make_python_function, ast_node)
    return ast_node
-def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, function_name="kernel",
+def create_indexed_kernel(assignments: NodeCollection,
-                          type_info=None, coordinate_names=('x', 'y', 'z')) -> KernelFunction:
+                          config: CreateKernelConfig) -> KernelFunction:
    """
    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
    coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
@@ -100,33 +109,41 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
    Args:
        assignments: list of assignments
-        index_fields: list of index fields, i.e. 1D fields with struct data type
+        config: kernel creation settings; supplies ``index_fields``, ``coordinate_names`` and ``function_name``
-        type_info: see documentation of :func:`create_kernel`
-        function_name: see documentation of :func:`create_kernel`
-        coordinate_names: name of the coordinate fields in the struct data type
    """
-    fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
+    function_name = config.function_name
+    index_fields = config.index_fields
+    coordinate_names = config.coordinate_names
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
    all_fields = fields_read.union(fields_written)
+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
+    assignments = assignments.all_assignments
+    assignments = add_types(assignments, config)
    for index_field in index_fields:
        index_field.field_type = FieldType.INDEXED
        assert FieldType.is_indexed(index_field)
        assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]
    def get_coordinate_symbol_assignment(name):
        for idx_field in index_fields:
            assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type"
            data_type = idx_field.dtype
            if data_type.has_element(name):
                rhs = idx_field[0](name)
-                lhs = TypedSymbol(name, BasicType(data_type.get_element_type(name)))
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                return SympyAssignment(lhs, rhs)
-        raise ValueError("Index %s not found in any of the passed index fields" % (name,))
+        raise ValueError(f"Index {name} not found in any of the passed index fields")
    coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
                                     for n in coordinate_names[:spatial_coordinates]]
@@ -141,18 +158,18 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu
        loop_body.append(assignment)
    function_body = Block([loop_node])
-    ast_node = KernelFunction(function_body, backend="cpu", function_name=function_name)
+    ast_node = KernelFunction(function_body, Target.CPU, Backend.C, make_python_function,
+                              ghost_layers=None, function_name=function_name, assignments=assignments)
    fixed_coordinate_mapping = {f.name: coordinate_typed_symbols for f in non_index_fields}
    read_only_fields = set([f.name for f in fields_read - fields_written])
    resolve_field_accesses(ast_node, read_only_fields, field_to_fixed_coordinates=fixed_coordinate_mapping)
    move_constants_before_loop(ast_node)
-    ast_node.compile = partial(make_python_function, ast_node)
    return ast_node
-def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
+def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, assume_single_outer_loop=True):
    """Parallelize the outer loop with OpenMP.
    Args:
@@ -160,41 +177,57 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None):
        schedule: OpenMP scheduling policy e.g. 'static' or 'dynamic'
        num_threads: explicitly specify number of threads
        collapse: number of nested loops to include in parallel region (see OpenMP collapse)
+        assume_single_outer_loop: if True, an exception is raised if multiple outer loops are detected. For all but
+                                  optimized staggered kernels, the single-outer-loop assumption should hold.
    """
    if not num_threads:
        return
    assert type(ast_node) is ast.KernelFunction
    body = ast_node.body
-    threads_clause = "" if num_threads and isinstance(num_threads, bool) else " num_threads(%s)" % (num_threads,)
+    threads_clause = "" if num_threads and isinstance(num_threads, bool) else f" num_threads({num_threads})"
    wrapper_block = ast.PragmaBlock('#pragma omp parallel' + threads_clause, body.take_child_nodes())
    body.append(wrapper_block)
-    outer_loops = [l for l in body.atoms(ast.LoopOverCoordinate) if l.is_outermost_loop]
+    outer_loops = [l for l in filtered_tree_iteration(body, LoopOverCoordinate, stop_type=SympyAssignment)
+                   if l.is_outermost_loop]
    assert outer_loops, "No outer loop found"
-    assert len(outer_loops) <= 1, "More than one outer loop found. Not clear where to put OpenMP pragma."
+    if assume_single_outer_loop and len(outer_loops) > 1:
-    loop_to_parallelize = outer_loops[0]
+        raise ValueError("More than one outer loop found, only one outer loop expected")
-    try:
-        loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
+    for loop_to_parallelize in outer_loops:
-    except TypeError:
+        try:
-        loop_range = None
+            loop_range = int(loop_to_parallelize.stop - loop_to_parallelize.start)
+        except TypeError:
-    if num_threads is None:
+            loop_range = None
-        import multiprocessing
-        num_threads = multiprocessing.cpu_count()
+        if loop_range is not None and loop_range < num_threads and not collapse:
+            contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
-    if loop_range is not None and loop_range < num_threads and not collapse:
+            if len(contained_loops) == 1:
-        contained_loops = [l for l in loop_to_parallelize.body.args if isinstance(l, LoopOverCoordinate)]
+                contained_loop = contained_loops[0]
-        if len(contained_loops) == 1:
+                try:
-            contained_loop = contained_loops[0]
+                    contained_loop_range = int(contained_loop.stop - contained_loop.start)
-            try:
+                    if contained_loop_range > loop_range:
-                contained_loop_range = int(contained_loop.stop - contained_loop.start)
+                        loop_to_parallelize = contained_loop
-                if contained_loop_range > loop_range:
+                except TypeError:
-                    loop_to_parallelize = contained_loop
+                    pass
-            except TypeError:
-                pass
+        prefix = f"#pragma omp for schedule({schedule})"
+        if collapse:
-    prefix = "#pragma omp for schedule(%s)" % (schedule,)
+            prefix += f" collapse({collapse})"
-    if collapse:
+        loop_to_parallelize.prefix_lines.append(prefix)
-        prefix += " collapse(%d)" % (collapse, )
-    loop_to_parallelize.prefix_lines.append(prefix)
+def add_pragmas(ast_node, pragma_lines, nesting_depth=-1):
+    """Prepends given pragma lines to all loops of specified nesting depth.
+    Args:
+        ast_node: pystencils abstract syntax tree
+        pragma_lines: Iterable of strings containing the pragma lines
+        nesting_depth: Nesting depth of the loops the pragmas should be applied to.
+                       Outermost loop has depth 0.
+                       A depth of -1 indicates the innermost loops.
+    """
+    loop_nodes = iterate_loops_by_depth(ast_node, nesting_depth)
+    for n in loop_nodes:
+        n.prefix_lines += list(pragma_lines)
--- a/pystencils/cpu/msvc_detection.py
+++ b/pystencils/cpu/msvc_detection.py
-import subprocess
 import os
+import subprocess
 def get_environment(version_specifier, arch='x64'):
@@ -71,7 +71,7 @@ def normalize_msvc_version(version):
 def get_environment_from_vc_vars_file(vc_vars_file, arch):
    out = subprocess.check_output(
-        'cmd /u /c "{}" {} && set'.format(vc_vars_file, arch),
+        f'cmd /u /c "{vc_vars_file}" {arch} && set',
        stderr=subprocess.STDOUT,
    ).decode('utf-16le', errors='replace')

--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
-import sympy as sp
 import warnings
-from typing import Union, Container
+from typing import Container, Union
-from pystencils.backends.simd_instruction_sets import get_vector_instruction_set
-from pystencils.fast_approximation import fast_division, fast_sqrt, fast_inv_sqrt
+import numpy as np
-from pystencils.integer_functions import modulo_floor, modulo_ceil
+import sympy as sp
-from pystencils.sympyextensions import fast_subs
+from sympy.logic.boolalg import BooleanFunction, BooleanAtom
-from pystencils.data_types import TypedSymbol, VectorType, get_type_of_expression, vector_memory_access, cast_func, \
-    collate_types, PointerType
 import pystencils.astnodes as ast
-from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
+from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
+                               get_type_of_expression, VectorMemoryAccess)
+from pystencils.functions import DivFunc
 from pystencils.field import Field
+from pystencils.integer_functions import modulo_ceil, modulo_floor
+from pystencils.sympyextensions import fast_subs
+from pystencils.transformations import cut_loop, filtered_tree_iteration, replace_inner_stride_with_one
+# noinspection PyPep8Naming
+class vec_any(sp.Function):
+    nargs = (1,)
+# noinspection PyPep8Naming
+class vec_all(sp.Function):
+    nargs = (1,)
+class NontemporalFence(ast.Node):
+    def __init__(self):
+        super(NontemporalFence, self).__init__(parent=None)
+    @property
+    def symbols_defined(self):
+        return set()
-def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
+    @property
+    def undefined_symbols(self):
+        return set()
+    @property
+    def args(self):
+        return []
+    def __eq__(self, other):
+        return isinstance(other, NontemporalFence)
+class CachelineSize(ast.Node):
+    symbol = sp.Symbol("_clsize")
+    mask_symbol = sp.Symbol("_clsize_mask")
+    last_symbol = sp.Symbol("_cl_lastvec")
+    def __init__(self):
+        super(CachelineSize, self).__init__(parent=None)
+    @property
+    def symbols_defined(self):
+        return {self.symbol, self.mask_symbol, self.last_symbol}
+    @property
+    def undefined_symbols(self):
+        return set()
+    @property
+    def args(self):
+        return []
+    def __eq__(self, other):
+        return isinstance(other, CachelineSize)
+    def __hash__(self):
+        return hash(self.symbol)
+def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
              assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
+    # TODO Vectorization Revamp: we first introduce the remainder loop and only then check if we can even vectorise.
+    #  Maybe first copy the ast and return the copied version on failure.
    """Explicit vectorization using SIMD vectorization via intrinsics.
    Args:
@@ -37,9 +100,14 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                        depending on the access pattern there might be additional padding
                                        required at the end of the array
    """
+    if instruction_set == 'best':
+        if get_supported_instruction_sets():
+            instruction_set = get_supported_instruction_sets()[-1]
+        else:
+            instruction_set = 'avx'
    if instruction_set is None:
        return
    all_fields = kernel_ast.fields_accessed
    if nontemporal is None or nontemporal is False:
        nontemporal = {}
@@ -55,37 +123,53 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'avx',
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
-    vector_is = get_vector_instruction_set('double' if float_size == 8 else 'float',
+    default_float_type = 'float64' if float_size == 8 else 'float32'
-                                           instruction_set=instruction_set)
+    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
-    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned,
+    if nontemporal and 'cachelineZero' in vector_is:
-                                                nontemporal, assume_sufficient_line_padding)
+        kernel_ast.use_all_written_field_sizes = True
-    insert_vector_casts(kernel_ast)
+    strided = 'storeS' in vector_is and 'loadS' in vector_is
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type)
-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
-                                                assume_sufficient_line_padding):
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
-    all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
+    all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
-    inner_loops = [n for n in all_loops if n.is_innermost_loop]
+    inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
-    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}
+    zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']
    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start
        # cut off loop tail, that is not a multiple of four
-        if assume_aligned and assume_sufficient_line_padding:
+        if keep_loop_stop:
+            pass
+        elif assume_aligned and assume_sufficient_line_padding:
            loop_range = loop_node.stop - loop_node.start
            new_stop = loop_node.start + modulo_ceil(loop_range, vector_width)
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-            loop_nodes = cut_loop(loop_node, [cutting_point])
+            # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
-            assert len(loop_nodes) in (1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
+                          if isinstance(loop, ast.LoopOverCoordinate)]
+            assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
+            if len(loop_nodes) == 0:
+                continue
            loop_node = loop_nodes[0]
+            # loop_node is the vectorized one
        # Find all array accesses (indexed) that depend on the loop counter as offset
        loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
        substitutions = {}
@@ -93,54 +177,184 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
+                    aligned_access = False
+                else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
+                    aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
-                aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
+                stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
-                if not loop_counter_is_offset:
+                if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
                    successful = False
                    break
                typed_symbol = base.label
-                assert type(typed_symbol.dtype) is PointerType, \
+                assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"
-                    "Type of access is {}, {}".format(typed_symbol.dtype, indexed)
                vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
                use_aligned_access = aligned_access and assume_aligned
                nontemporal = False
                if hasattr(indexed, 'field'):
                    nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
-                substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal)
+                substitutions[indexed] = VectorMemoryAccess(indexed, vec_type, use_aligned_access, nontemporal, True,
+                                                            stride if strided else 1)
+                if nontemporal:
+                    # insert NontemporalFence after the outermost loop
+                    parent = loop_node.parent
+                    while type(parent.parent.parent) is not ast.KernelFunction:
+                        parent = parent.parent
+                    parent.parent.insert_after(NontemporalFence(), parent, if_not_exists=True)
+                    # insert CachelineSize at the beginning of the kernel
+                    parent.parent.insert_front(CachelineSize(), if_not_exists=True)
        if not successful:
            warnings.warn("Could not vectorize loop because of non-consecutive memory access")
            continue
        loop_node.step = vector_width
        loop_node.subs(substitutions)
+        arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
+        arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
+                         VectorType(loop_counter_symbol.dtype, vector_int_width))
+        vector_loop_counter = arg_1 + arg_2
+        fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
+                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, VectorMemoryAccess))
+        mask_conditionals(loop_node)
+        from pystencils.rng import RNGBase
+        substitutions = {}
+        for rng in loop_node.atoms(RNGBase):
+            new_result_symbols = [TypedSymbol(s.name, VectorType(s.dtype, width=vector_width))
+                                  for s in rng.result_symbols]
+            substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
+            rng._symbols_defined = set(new_result_symbols)
+        fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
+        insert_vector_casts(loop_node, vector_is, default_float_type)
+def mask_conditionals(loop_body):
+    """Fold the conditions of ``ast.Conditional`` nodes into the masks of vector memory accesses.
+    The tree below ``loop_body`` is walked recursively. For every ``ast.SympyAssignment`` that lies inside
+    one or more conditionals, each ``VectorMemoryAccess`` it contains is rebuilt with its fifth argument
+    (index 4, treated here as the access mask) and-ed with the accumulated branch condition. Conditions
+    that depend on the loop counter are additionally wrapped in ``vec_any``.
+    Args:
+        loop_body: node providing ``loop_counter_symbol`` and ``args``; it is modified in place.
+    Returns:
+        None
+    """
+    def visit_node(node, mask):
+        if isinstance(node, ast.Conditional):
+            cond = node.condition_expr
+            # Conditions that do not involve the loop counter, or that are already reduced with
+            # vec_all/vec_any, contribute no mask to the true branch.
+            skip = (loop_body.loop_counter_symbol not in cond.atoms(sp.Symbol)) or cond.func in (vec_all, vec_any)
+            cond = True if skip else cond
+            true_mask = sp.And(cond, mask)
+            visit_node(node.true_block, true_mask)
+            if node.false_block:
+                false_mask = sp.And(sp.Not(node.condition_expr), mask)
+                # Descend into the else-branch only. Visiting ``node`` itself again would re-enter this
+                # very branch without end and would also re-mask the true block with a contradictory mask.
+                visit_node(node.false_block, false_mask)
+            if not skip:
+                # Wrapped only after both branches were visited, so the masks above use the raw condition.
+                node.condition_expr = vec_any(node.condition_expr)
+        elif isinstance(node, ast.SympyAssignment):
+            if mask is not True:
+                s = {ma: VectorMemoryAccess(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
+                     for ma in node.atoms(VectorMemoryAccess)}
+                node.subs(s)
+        else:
+            for arg in node.args:
+                visit_node(arg, mask)
+    visit_node(loop_body, mask=True)
-def insert_vector_casts(ast_node):
+def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
    """Inserts necessary casts from scalar values to vector values."""
-    handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt)
+    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
-    def visit_expr(expr):
+    def is_scalar(expr) -> bool:
+        if hasattr(expr, "dtype"):
+            if type(expr.dtype) is VectorType:
+                return False
+            # Else branch: If expr is a CastFunc, then whether the expression
+            # is scalar is determined by the argument (remember: vector casts
+            # are not inserted yet). Therefore, we must recurse into the args of
+            # expr below. Otherwise, this expression is atomic and in that case
+            # it is assumed to be scalar below.
-        if isinstance(expr, cast_func) or isinstance(expr, vector_memory_access):
+        if isinstance(expr, ast.ResolvedFieldAccess):
-            return expr
+            # expr.field is not in expr.args
-        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, sp.boolalg.BooleanFunction):
+            return is_scalar(expr.field)
-            new_args = [visit_expr(a) for a in expr.args]
+        elif isinstance(expr, (vec_any, vec_all)):
-            arg_types = [get_type_of_expression(a) for a in new_args]
+            return True
+        if not hasattr(expr, "args"):
+            return True
+        return all(is_scalar(arg) for arg in expr.args)
+    # TODO Vectorization Revamp: get rid of default_type
+    def visit_expr(expr, default_type='double', force_vectorize=False):
+        if isinstance(expr, VectorMemoryAccess):
+            return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
+                                      *expr.args[5:])
+        elif isinstance(expr, CastFunc):
+            cast_type = expr.args[1]
+            arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            assert cast_type in [BasicType('float32'), BasicType('float64')], \
+                f'Vectorization cannot vectorize type {cast_type}'
+            return expr.func(arg, VectorType(cast_type, instruction_set['width']))
+        elif expr.func is sp.Abs and 'abs' not in instruction_set:
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
+                else get_type_of_expression(expr.args[0])
+            pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
+                              (new_arg, True))
+            return visit_expr(pw, default_type, force_vectorize)
+        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
+            if expr.func is sp.Mul and expr.args[0] == -1:
+                # special treatment for the unary minus: make sure that the -1 has the same type as the argument
+                dtype = int
+                for arg in expr.atoms(VectorMemoryAccess):
+                    if arg.dtype.base_type.is_float():
+                        dtype = arg.dtype.base_type.numpy_dtype.type
+                for arg in expr.atoms(TypedSymbol):
+                    if type(arg.dtype) is VectorType and arg.dtype.base_type.is_float():
+                        dtype = arg.dtype.base_type.numpy_dtype.type
+                if dtype is not int:
+                    if dtype is np.float32:
+                        default_type = 'float'
+                    expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
            if not any(type(t) is VectorType for t in arg_types):
                return expr
            else:
                target_type = collate_types(arg_types)
-                casted_args = [cast_func(a, target_type) if t != target_type else a
+                casted_args = [
-                               for a, t in zip(new_args, arg_types)]
+                    CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                    for a, t in zip(new_args, arg_types)]
                return expr.func(*casted_args)
+        elif expr.func is sp.UnevaluatedExpr:
+            assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
+            # TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
+            # TODO be fixed for real at some point.
+            if expr.args[0].is_Pow:
+                base = expr.args[0].base
+                exp = expr.args[0].exp
+                expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
+            target_type = collate_types(arg_types)
+            if not any(type(t) is VectorType for t in arg_types):
+                target_type = VectorType(target_type, instruction_set['width'])
+            casted_args = [
+                CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                for a, t in zip(new_args, arg_types)]
+            return expr.func(expr.args[0].func(*casted_args, evaluate=False))
        elif expr.func is sp.Pow:
-            new_arg = visit_expr(expr.args[0])
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
            return expr.func(new_arg, expr.args[1])
        elif expr.func == sp.Piecewise:
-            new_results = [visit_expr(a[0]) for a in expr.args]
+            new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
-            new_conditions = [visit_expr(a[1]) for a in expr.args]
+            new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
            types_of_results = [get_type_of_expression(a) for a in new_results]
            types_of_conditions = [get_type_of_expression(a) for a in new_conditions]
@@ -151,38 +365,61 @@ def insert_vector_casts(ast_node):
            if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
                condition_target_type = VectorType(condition_target_type, width=result_target_type.width)
-            casted_results = [cast_func(a, result_target_type) if t != result_target_type else a
+            casted_results = [CastFunc(a, result_target_type) if t != result_target_type else a
                              for a, t in zip(new_results, types_of_results)]
-            casted_conditions = [cast_func(a, condition_target_type)
+            casted_conditions = [CastFunc(a, condition_target_type)
                                 if t != condition_target_type and a is not True else a
                                 for a, t in zip(new_conditions, types_of_conditions)]
            return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
-        else:
+        elif isinstance(expr, TypedSymbol):
+            if force_vectorize:
+                expr_type = get_type_of_expression(expr)
+                if type(expr_type) is not VectorType:
+                    vector_type = VectorType(expr_type, instruction_set['width'])
+                    return CastFunc(expr, vector_type)
            return expr
+        elif isinstance(expr, (sp.Number, BooleanAtom)):
+            return expr
+        else:
+            raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
+                                      f'The expression {expr} of type {type(expr)} is not known yet.')
-    def visit_node(node, substitution_dict):
+    def visit_node(node, substitution_dict, default_type='double'):
        substitution_dict = substitution_dict.copy()
        for arg in node.args:
            if isinstance(arg, ast.SympyAssignment):
                assignment = arg
+                # If there is a remainder loop we do not vectorise it, thus lhs will indicate this
+                # if isinstance(assignment.lhs, ast.ResolvedFieldAccess):
+                # continue
                subs_expr = fast_subs(assignment.rhs, substitution_dict,
                                      skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
-                assignment.rhs = visit_expr(subs_expr)
-                rhs_type = get_type_of_expression(assignment.rhs)
+                # If either side contains a vectorized subexpression, both sides
+                # must be fully vectorized.
+                lhs_scalar = is_scalar(assignment.lhs)
+                rhs_scalar = is_scalar(subs_expr)
+                assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
                if isinstance(assignment.lhs, TypedSymbol):
-                    lhs_type = assignment.lhs.dtype
+                    if lhs_scalar and not rhs_scalar:
-                    if type(rhs_type) is VectorType and type(lhs_type) is not VectorType:
+                        lhs_type = get_type_of_expression(assignment.lhs)
+                        rhs_type = get_type_of_expression(assignment.rhs)
                        new_lhs_type = VectorType(lhs_type, rhs_type.width)
                        new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
                        substitution_dict[assignment.lhs] = new_lhs
                        assignment.lhs = new_lhs
-                elif isinstance(assignment.lhs.func, cast_func):
+                elif isinstance(assignment.lhs, VectorMemoryAccess):
-                    lhs_type = assignment.lhs.args[1]
+                    assignment.lhs = visit_expr(assignment.lhs, default_type)
-                    if type(lhs_type) is VectorType and type(rhs_type) is not VectorType:
+            elif isinstance(arg, ast.Conditional):
-                        assignment.rhs = cast_func(assignment.rhs, lhs_type)
+                arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict,
+                                               skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
+                arg.condition_expr = visit_expr(arg.condition_expr, default_type)
+                visit_node(arg, substitution_dict, default_type)
            else:
-                visit_node(arg, substitution_dict)
+                visit_node(arg, substitution_dict, default_type)
-    visit_node(ast_node, {})
+    visit_node(ast_node, {}, default_float_type)
No results found