5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/bit_masks.py
+++ b/pystencils/bit_masks.py
 import sympy as sp
-from pystencils.data_types import get_type_of_expression
+# from pystencils.typing import get_type_of_expression


 # noinspection PyPep8Naming
@@ -22,13 +22,14 @@ class flag_cond(sp.Function):

    def __new__(cls, flag_bit, mask_expression, *expressions):

-        flag_dtype = get_type_of_expression(flag_bit)
-        if not flag_dtype.is_int():
-            raise ValueError('Argument flag_bit must be of integer type.')
-
-        mask_dtype = get_type_of_expression(mask_expression)
-        if not mask_dtype.is_int():
-            raise ValueError('Argument mask_expression must be of integer type.')
+        # TODO Jan reintroduce checking
+        # flag_dtype = get_type_of_expression(flag_bit)
+        # if not flag_dtype.is_int():
+        #     raise ValueError('Argument flag_bit must be of integer type.')
+        #
+        # mask_dtype = get_type_of_expression(mask_expression)
+        # if not mask_dtype.is_int():
+        #     raise ValueError('Argument mask_expression must be of integer type.')

        return super().__new__(cls, flag_bit, mask_expression, *expressions)


--- a/pystencils/boundaries/__init__.py
+++ b/pystencils/boundaries/__init__.py
--- a/pystencils/boundaries/boundaryconditions.py
+++ b/pystencils/boundaries/boundaryconditions.py
 from typing import Any, List, Tuple

-from pystencils import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.boundaries.boundaryhandling import BoundaryOffsetInfo
-from pystencils.data_types import create_type
+from pystencils.typing import create_type


 class Boundary:
@@ -14,7 +14,7 @@ class Boundary:
    def __init__(self, name=None):
        self._name = name

-    def __call__(self, field, direction_symbol, index_field) -> List[Assignment]:
+    def __call__(self, field, direction_symbol, index_field) -> List[SympyAssignment]:
        """Defines the boundary behavior and must therefore be implemented by all boundaries.

        Here the boundary is defined as a list of sympy assignments, from which a boundary kernel is generated.
@@ -63,20 +63,20 @@ class Neumann(Boundary):

        neighbor = BoundaryOffsetInfo.offset_from_dir(direction_symbol, field.spatial_dimensions)
        if field.index_dimensions == 0:
-            return [Assignment(field.center, field[neighbor])]
+            return [SympyAssignment(field.center, field[neighbor])]
        else:
            from itertools import product
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Neumann boundary works only for fields with fixed index shape")
            index_iter = product(*(range(i) for i in field.index_shape))
-            return [Assignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]
+            return [SympyAssignment(field(*idx), field[neighbor](*idx)) for idx in index_iter]

    def __hash__(self):
        # All boundaries of these class behave equal -> should also be equal
        return hash("Neumann")

    def __eq__(self, other):
-        return type(other) == Neumann
+        return type(other) is Neumann


 class Dirichlet(Boundary):
@@ -103,11 +103,11 @@ class Dirichlet(Boundary):
    def __call__(self, field, direction_symbol, index_field, **kwargs):

        if field.index_dimensions == 0:
-            return [Assignment(field.center, index_field("value") if self.additional_data else self._value)]
+            return [SympyAssignment(field.center, index_field("value") if self.additional_data else self._value)]
        elif field.index_dimensions == 1:
            assert not self.additional_data
            if not field.has_fixed_index_shape:
                raise NotImplementedError("Field needs fixed index shape")
            assert len(self._value) == field.index_shape[0], "Dirichlet value does not match index shape of field"
-            return [Assignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
+            return [SympyAssignment(field(i), self._value[i]) for i in range(field.index_shape[0])]
        raise NotImplementedError("Dirichlet boundary not implemented for fields with more than one index dimension")
--- a/pystencils/boundaries/boundaryhandling.py
+++ b/pystencils/boundaries/boundaryhandling.py
+from functools import lru_cache
+
 import numpy as np
 import sympy as sp

 from pystencils import create_kernel, CreateKernelConfig, Target
-from pystencils.assignment import Assignment
+from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CustomCodeNode
 from pystencils.boundaries.createindexlist import (
    create_boundary_index_array, numpy_data_type_for_boundary_object)
-from pystencils.cache import memorycache
-from pystencils.data_types import TypedSymbol, create_type
-from pystencils.datahandling.pycuda import PyCudaArrayHandler
+from pystencils.typing import TypedSymbol, create_type
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler
 from pystencils.field import Field
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol

 try:
    # noinspection PyPep8Naming
    import waLBerla as wlb
    if wlb.cpp_available:
        from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
+        import cupy.cuda.runtime
    else:
        ParallelDataHandling = None
 except ImportError:
@@ -33,11 +35,11 @@ class FlagInterface:
        >>> dh = create_data_handling((4, 5))
        >>> fi = FlagInterface(dh, 'flag_field', np.uint8)
        >>> assert dh.has_data('flag_field')
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        2
-        >>> fi.reserve_flag(4)
+        >>> int(fi.reserve_flag(4))
        4
-        >>> fi.reserve_next_flag()
+        >>> int(fi.reserve_next_flag())
        8
    """

@@ -99,7 +101,7 @@ class BoundaryHandling:
        self.flag_interface = fi if fi is not None else FlagInterface(data_handling, name + "Flags")

        if ParallelDataHandling and isinstance(self.data_handling, ParallelDataHandling):
-            array_handler = PyCudaArrayHandler()
+            array_handler = GPUArrayHandler(cupy.cuda.runtime.getDevice())
        else:
            array_handler = self.data_handling.array_handler

@@ -115,7 +117,8 @@ class BoundaryHandling:

            for obj, cpu_arr in cpu_version.items():
                if obj not in gpu_version or gpu_version[obj].shape != cpu_arr.shape:
-                    gpu_version[obj] = array_handler.to_gpu(cpu_arr)
+                    gpu_version[obj] = array_handler.empty(cpu_arr.shape, cpu_arr.dtype)
+                    array_handler.upload(gpu_version[obj], cpu_arr)
                else:
                    array_handler.upload(gpu_version[obj], cpu_arr)

@@ -378,15 +381,15 @@ class BoundaryDataSetter:
        assert coord < self.dim
        return self.index_array[self.coord_map[coord]] + self.offset[coord] - self.ghost_layers + 0.5

-    @memorycache()
+    @lru_cache()
    def link_offsets(self):
        return self.stencil[self.index_array['dir']]

-    @memorycache()
+    @lru_cache()
    def link_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + 0.5 * self.link_offsets()[:, coord]

-    @memorycache()
+    @lru_cache()
    def boundary_cell_positions(self, coord):
        return self.non_boundary_cell_positions(coord) + self.link_offsets()[:, coord]

@@ -423,29 +426,30 @@ class BoundaryOffsetInfo(CustomCodeNode):
        code = "\n"
        for i in range(dim):
            offset_str = ", ".join([str(d[i]) for d in stencil])
-            code += "const int64_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)
+            code += "const int32_t %s [] = { %s };\n" % (offset_sym[i].name, offset_str)

        inv_dirs = []
        for direction in stencil:
            inverse_dir = tuple([-i for i in direction])
            inv_dirs.append(str(stencil.index(inverse_dir)))

-        code += "const int64_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
+        code += "const int32_t %s [] = { %s };\n" % (self.INV_DIR_SYMBOL.name, ", ".join(inv_dirs))
        offset_symbols = BoundaryOffsetInfo._offset_symbols(dim)
        super(BoundaryOffsetInfo, self).__init__(code, symbols_read=set(),
                                                 symbols_defined=set(offset_symbols + [self.INV_DIR_SYMBOL]))

    @staticmethod
    def _offset_symbols(dim):
-        return [TypedSymbol(f"c{d}", create_type(np.int64)) for d in ['x', 'y', 'z'][:dim]]
+        return [TypedSymbol(f"c{d}", create_type(np.int32)) for d in ['x', 'y', 'z'][:dim]]

-    INV_DIR_SYMBOL = TypedSymbol("invdir", np.int64)
+    INV_DIR_SYMBOL = TypedSymbol("invdir", np.int32)


 def create_boundary_kernel(field, index_field, stencil, boundary_functor, target=Target.CPU, **kernel_creation_args):
    elements = [BoundaryOffsetInfo(stencil)]
-    dir_symbol = TypedSymbol("dir", np.int64)
-    elements += [Assignment(dir_symbol, index_field[0]('dir'))]
+    dir_symbol = TypedSymbol("dir", np.int32)
+    elements += [SympyAssignment(dir_symbol, index_field[0]('dir'))]
    elements += boundary_functor(field, direction_symbol=dir_symbol, index_field=index_field)
-    config = CreateKernelConfig(index_fields=[index_field], target=target, **kernel_creation_args)
+    config = CreateKernelConfig(index_fields=[index_field], target=target, skip_independence_check=True,
+                                **kernel_creation_args)
    return create_kernel(elements, config=config)
--- a/pystencils/boundaries/createindexlist.py
+++ b/pystencils/boundaries/createindexlist.py
@@ -2,66 +2,83 @@ import warnings

 import numpy as np

+
 try:
-    # Try to import right away - assume compiled code is available
-    # compile with: python setup.py build_ext --inplace --use-cython
-    from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-        create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, create_boundary_cell_index_list_3d
+    import pyximport

+    pyximport.install(language_level=3)
    cython_funcs_available = True
 except ImportError:
-    try:
-        # If not, try development mode and import via pyximport
-        import pyximport
-
-        pyximport.install(language_level=3)
-        cython_funcs_available = True
-    except ImportError:
-        cython_funcs_available = False
-    if cython_funcs_available:
-        from pystencils.boundaries.createindexlistcython import create_boundary_neighbor_index_list_2d, \
-            create_boundary_neighbor_index_list_3d, create_boundary_cell_index_list_2d, \
-            create_boundary_cell_index_list_3d
+    cython_funcs_available = False
+
+if cython_funcs_available:
+    from pystencils.boundaries.createindexlistcython import (
+        create_boundary_neighbor_index_list_2d,
+        create_boundary_neighbor_index_list_3d,
+        create_boundary_cell_index_list_2d,
+        create_boundary_cell_index_list_3d,
+    )

 boundary_index_array_coordinate_names = ["x", "y", "z"]
 direction_member_name = "dir"
+default_index_array_dtype = np.int32


 def numpy_data_type_for_boundary_object(boundary_object, dim):
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    return np.dtype([(name, np.int32) for name in coordinate_names]
-                    + [(direction_member_name, np.int32)]
-                    + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data], align=True)
-
-
-def _create_index_list_python(flag_field_arr, boundary_mask,
-                              fluid_mask, stencil, single_link, inner_or_boundary=False, nr_of_ghost_layers=None):
-
+    return np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+        + [(i[0], i[1].numpy_dtype) for i in boundary_object.additional_data],
+        align=True,
+    )
+
+
+def _create_index_list_python(
+    flag_field_arr,
+    boundary_mask,
+    fluid_mask,
+    stencil,
+    single_link,
+    inner_or_boundary=False,
+    nr_of_ghost_layers=None,
+):
    if inner_or_boundary and nr_of_ghost_layers is None:
-        raise ValueError("If inner_or_boundary is set True the number of ghost layers "
-                         "around the inner domain has to be specified")
+        raise ValueError(
+            "If inner_or_boundary is set True the number of ghost layers "
+            "around the inner domain has to be specified"
+        )

    if nr_of_ghost_layers is None:
        nr_of_ghost_layers = 0

-    coordinate_names = boundary_index_array_coordinate_names[:len(flag_field_arr.shape)]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
+    coordinate_names = boundary_index_array_coordinate_names[
+        : len(flag_field_arr.shape)
+    ]
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+    )

    # boundary cells are extracted via np.where. To ensure continous memory access in the compute kernel these cells
    # have to be sorted.
    boundary_cells = np.transpose(np.nonzero(flag_field_arr == boundary_mask))
    for i in range(len(flag_field_arr.shape)):
-        boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind='mergesort')]
+        boundary_cells = boundary_cells[boundary_cells[:, i].argsort(kind="mergesort")]

    # First a set is created to save all fluid cells which are near boundary
    fluid_cells = set()
    for cell in boundary_cells:
        cell = tuple(cell)
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
+            )
            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
-            if any(not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
-                   for e, upper in zip(neighbor_cell, flag_field_arr.shape)):
+            if any(
+                not 0 + nr_of_ghost_layers <= e < upper - nr_of_ghost_layers
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
                continue
            if flag_field_arr[neighbor_cell] & fluid_mask:
                fluid_cells.add(neighbor_cell)
@@ -81,9 +98,14 @@ def _create_index_list_python(flag_field_arr, boundary_mask,
        cell = tuple(cell)
        sum_cells = np.zeros(len(cell))
        for dir_idx, direction in enumerate(stencil):
-            neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
+            neighbor_cell = tuple(
+                [cell_i + dir_i for cell_i, dir_i in zip(cell, direction)]
+            )
            # prevent out ouf bounds access. If boundary cell is at the border, some stencil directions would be out.
-            if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field_arr.shape)):
+            if any(
+                not 0 <= e < upper
+                for e, upper in zip(neighbor_cell, flag_field_arr.shape)
+            ):
                continue
            if flag_field_arr[neighbor_cell] & checkmask:
                if single_link:
@@ -99,8 +121,15 @@ def _create_index_list_python(flag_field_arr, boundary_mask,
    return np.array(result, dtype=index_arr_dtype)


-def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
-                               nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
+def create_boundary_index_list(
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
    """Creates a numpy array storing links (connections) between domain cells and boundary cells.

    Args:
@@ -117,10 +146,20 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
    """
    dim = len(flag_field.shape)
    coordinate_names = boundary_index_array_coordinate_names[:dim]
-    index_arr_dtype = np.dtype([(name, np.int32) for name in coordinate_names] + [(direction_member_name, np.int32)])
-
-    stencil = np.array(stencil, dtype=np.int32)
-    args = (flag_field, nr_of_ghost_layers, boundary_mask, fluid_mask, stencil, single_link)
+    index_arr_dtype = np.dtype(
+        [(name, default_index_array_dtype) for name in coordinate_names]
+        + [(direction_member_name, default_index_array_dtype)]
+    )
+
+    stencil = np.array(stencil, dtype=default_index_array_dtype)
+    args = (
+        flag_field,
+        nr_of_ghost_layers,
+        boundary_mask,
+        fluid_mask,
+        stencil,
+        single_link,
+    )
    args_no_gl = (flag_field, boundary_mask, fluid_mask, stencil, single_link)

    if cython_funcs_available:
@@ -139,22 +178,42 @@ def create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
        return np.array(idx_list, dtype=index_arr_dtype)
    else:
        if flag_field.size > 1e6:
-            warnings.warn("Boundary setup may take very long! Consider installing cython to speed it up")
-        return _create_index_list_python(*args_no_gl, inner_or_boundary=inner_or_boundary,
-                                         nr_of_ghost_layers=nr_of_ghost_layers)
-
-
-def create_boundary_index_array(flag_field, stencil, boundary_mask, fluid_mask, boundary_object,
-                                nr_of_ghost_layers=1, inner_or_boundary=True, single_link=False):
-    idx_array = create_boundary_index_list(flag_field, stencil, boundary_mask, fluid_mask,
-                                           nr_of_ghost_layers, inner_or_boundary, single_link)
+            warnings.warn(
+                "Boundary setup may take very long! Consider installing cython to speed it up"
+            )
+        return _create_index_list_python(
+            *args_no_gl,
+            inner_or_boundary=inner_or_boundary,
+            nr_of_ghost_layers=nr_of_ghost_layers,
+        )
+
+
+def create_boundary_index_array(
+    flag_field,
+    stencil,
+    boundary_mask,
+    fluid_mask,
+    boundary_object,
+    nr_of_ghost_layers=1,
+    inner_or_boundary=True,
+    single_link=False,
+):
+    idx_array = create_boundary_index_list(
+        flag_field,
+        stencil,
+        boundary_mask,
+        fluid_mask,
+        nr_of_ghost_layers,
+        inner_or_boundary,
+        single_link,
+    )
    dim = len(flag_field.shape)

    if boundary_object.additional_data:
        coordinate_names = boundary_index_array_coordinate_names[:dim]
        index_arr_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
        extended_idx_field = np.empty(len(idx_array), dtype=index_arr_dtype)
-        for prop in coordinate_names + ['dir']:
+        for prop in coordinate_names + ["dir"]:
            extended_idx_field[prop] = idx_array[prop]

        idx_array = extended_idx_field

--- a/pystencils/boundaries/createindexlistcython.pyx
+++ b/pystencils/boundaries/createindexlistcython.pyx
-# distutils: language=c
-# Workaround for cython bug
-# see https://stackoverflow.com/questions/8024805/cython-compiled-c-extension-importerror-dynamic-module-does-not-define-init-fu
-WORKAROUND = "Something"
+# cython: language_level=3str

 import cython


--- a/pystencils/boundaries/inkernel.py
+++ b/pystencils/boundaries/inkernel.py
 import sympy as sp

 from pystencils.boundaries.boundaryhandling import DEFAULT_FLAG_TYPE
-from pystencils.data_types import TypedSymbol, create_type
+from pystencils.typing import TypedSymbol, create_type
 from pystencils.field import Field
 from pystencils.integer_functions import bitwise_and


--- a/pystencils/cache.py
+++ b/pystencils/cache.py
@@ -3,10 +3,7 @@ from collections.abc import Hashable
 from functools import partial, wraps
 from itertools import chain

-try:
-    from functools import lru_cache as memorycache
-except ImportError:
-    from backports.functools_lru_cache import lru_cache as memorycache
+from functools import lru_cache as memorycache

 from joblib import Memory
 from appdirs import user_cache_dir
@@ -62,6 +59,14 @@ def sharedmethodcache(cache_id: str):
    return _decorator


+def clear_cache():
+    """
+    Clears the pystencils cache created by joblib.
+    """
+    memory = Memory(cache_dir, verbose=0)
+    memory.clear(warn=False)
+
+
 # Disable memory cache:
 # disk_cache = lambda o: o
 # disk_cache_no_fallback = lambda o: o
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
+from copy import copy
+from collections import defaultdict
+from dataclasses import dataclass, field
+from types import MappingProxyType
+from typing import Union, Tuple, List, Dict, Callable, Any, DefaultDict, Iterable
+
+from pystencils import Target, Backend, Field
+from pystencils.typing.typed_sympy import BasicType
+from pystencils.typing.utilities import collate_types
+
+import numpy as np
+
+# TODO: There exists DTypeLike in NumPy which would be better than type for type hinting, to new at the moment
+# from numpy.typing import DTypeLike
+
+
+# TODO: CreateKernelConfig is bloated think of more classes better usage, factory whatever ...
+# Proposition: CreateKernelConfigs Classes for different targets?
+@dataclass
+class CreateKernelConfig:
+    """
+    **Below all parameters for the CreateKernelConfig are explained**
+    """
+    target: Target = Target.CPU
+    """
+    All targets are defined in :class:`pystencils.enums.Target`
+    """
+    backend: Backend = None
+    """
+    All backends are defined in :class:`pystencils.enums.Backend`
+    """
+    function_name: str = 'kernel'
+    """
+    Name of the generated function - only important if generated code is written out
+    """
+    data_type: Union[type, str, DefaultDict[str, BasicType], Dict[str, BasicType]] = np.float64
+    """
+    Data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name to type.
+    If specified as a dict ideally a defaultdict is used to define a default value for symbols not listed in the
+    dict. If a plain dict is provided it will be transformed into a defaultdict internally. The default value 
+    will then be specified via type collation then.
+    """
+    default_number_float: Union[type, str, BasicType] = None
+    """
+    Data type used for all untyped floating point numbers (i.e. 0.5). By default the value of data_type is used.
+    If data_type is given as a defaultdict its default_factory is used.
+    """
+    default_number_int: Union[type, str, BasicType] = np.int64
+    """
+    Data type used for all untyped integer numbers (i.e. 1)
+    """
+    iteration_slice: Tuple = None
+    """
+    Rectangular subset to iterate over, if not specified the complete non-ghost layer part of the field is iterated over
+    """
+    ghost_layers: Union[bool, int, List[Tuple[int]]] = None
+    """
+    A single integer specifies the ghost layer count at all borders, can also be a sequence of
+    pairs ``[(x_lower_gl, x_upper_gl), .... ]``. These layers are excluded from the iteration.
+    If left to default, the number of ghost layers is determined automatically from the assignments.
+    """
+    cpu_openmp: Union[bool, int] = False
+    """
+    `True` or number of threads for OpenMP parallelization, `False` for no OpenMP. If set to `True`, the maximum number
+    of available threads will be chosen.
+    """
+    cpu_vectorize_info: Dict = None
+    """
+    A dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
+    for documentation of these parameters see vectorize function. Example:
+    '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
+    """
+    cpu_blocking: Tuple[int] = None
+    """
+    A tuple of block sizes or `None` if no blocking should be applied
+    """
+    omp_single_loop: bool = True
+    """
+    If OpenMP is active: whether multiple outer loops are permitted
+    """
+    base_pointer_specification: Union[List[Iterable[str]], List[Iterable[int]]] = None
+    """
+    Specification of how many and which intermediate pointers are created for a field access.
+    For example [ (0), (2,3,)]  creates on base pointer for coordinates 2 and 3 and writes the offset for coordinate
+    zero directly in the field access. These specifications are defined dependent on the loop ordering.
+    This function translates more readable version into the specification above.
+    
+    For more information see: `pystencils.transformations.create_intermediate_base_pointer`
+    """
+    gpu_indexing: str = 'block'
+    """
+    Either 'block' or 'line' , or custom indexing class, see `pystencils.gpu.AbstractIndexing`
+    """
+    gpu_indexing_params: MappingProxyType = field(default_factory=lambda: MappingProxyType({}))
+    """
+    Dict with indexing parameters (constructor parameters of indexing class)
+    e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'.
+    """
+    # TODO Markus rework this docstring
+    default_assignment_simplifications: bool = False
+    """
+    If `True` default simplifications are first performed on the Assignments. If problems occur during the
+    simplification a warning will be thrown.
+    Furthermore, it is essential to know that this is a two-stage process. The first stage of the process acts
+    on the level of the `pystencils.AssignmentCollection`.  In this part,
+    `pystencil.simp.create_simplification_strategy` from pystencils.simplificationfactory will be used to
+    apply optimisations like insertion of constants to
+    remove pressure from the registers. Thus the first part of the optimisations can only be executed if
+    an `AssignmentCollection` is passed. The second part of the optimisation acts on the level of each Assignment
+    individually. In this stage, all optimisations from `sympy.codegen.rewriting.optims_c99` are applied
+    to each Assignment. Thus this stage can also be applied if a list of Assignments is passed.
+    """
+    cpu_prepend_optimizations: List[Callable] = field(default_factory=list)
+    """
+    List of extra optimizations to perform first on the AST.
+    """
+    use_auto_for_assignments: bool = False
+    """
+    If set to `True`, auto can be used in the generated code for data types. This makes the type system more robust.
+    """
+    index_fields: List[Field] = None
+    """
+    List of index fields, i.e. 1D fields with struct data type. If not `None`, `create_index_kernel`
+    instead of `create_domain_kernel` is used.
+    """
+    coordinate_names: Tuple[str, Any] = ('x', 'y', 'z')
+    """
+    Name of the coordinate fields in the struct data type.
+    """
+    allow_double_writes: bool = False
+    """
+    If True, don't check if every field is only written at a single location. This is required
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
+    cells at once. Use with care!
+    """
+    skip_independence_check: bool = False
+    """
+    By default the assignment list is checked for read/write independence. This means fields are only written at
+    locations where they are read. Doing so guarantees thread safety. In some cases e.g. for
+    periodicity kernel, this can not be assured and does the check needs to be deactivated. Use with care!
+    """
+
+    class DataTypeFactory:
+        """Because of pickle, we need to have a nested class, instead of a lambda in __post_init__"""
+        def __init__(self, dt):
+            self.dt = dt
+
+        def __call__(self):
+            return BasicType(self.dt)
+
+    def _check_type(self, dtype_to_check):
+        if isinstance(dtype_to_check, str) and (dtype_to_check == 'float' or dtype_to_check == 'int'):
+            self._typing_error()
+
+        if isinstance(dtype_to_check, type) and not hasattr(dtype_to_check, "dtype"):
+            # NumPy-types are also of type 'type'. However, they have more properties
+            self._typing_error()
+
+    @staticmethod
+    def _typing_error():
+        raise ValueError("It is not possible to use python types (float, int) for datatypes because these "
+                         "types are ambiguous. For example float will map to double. "
+                         "Also the string version like 'float' is not allowed, e.g. use 'float64' instead")
+
+    def __post_init__(self):
+        # ----  Legacy parameters
+        if not isinstance(self.target, Target):
+            raise ValueError("target must be provided by the 'Target' enum")
+
+        # ---- Auto Backend
+        if not self.backend:
+            if self.target == Target.CPU:
+                self.backend = Backend.C
+            elif self.target == Target.GPU:
+                self.backend = Backend.CUDA
+            else:
+                raise NotImplementedError(f'Target {self.target} has no default backend')
+
+        if not isinstance(self.backend, Backend):
+            raise ValueError("backend must be provided by the 'Backend' enum")
+
+        # Normalise data types
+        for dtype in [self.data_type, self.default_number_float, self.default_number_int]:
+            self._check_type(dtype)
+
+        if not isinstance(self.data_type, dict):
+            dt = copy(self.data_type)  # The copy is necessary because BasicType has sympy shinanigans
+            self.data_type = defaultdict(self.DataTypeFactory(dt))
+
+        if isinstance(self.data_type, dict) and not isinstance(self.data_type, defaultdict):
+            for dtype in self.data_type.values():
+                self._check_type(dtype)
+
+            dt = collate_types([BasicType(dtype) for dtype in self.data_type.values()])
+            dtype_dict = self.data_type
+            self.data_type = defaultdict(self.DataTypeFactory(dt), dtype_dict)
+
+        assert isinstance(self.data_type, defaultdict), "At this point data_type must be a defaultdict!"
+        for dtype in self.data_type.values():
+            self._check_type(dtype)
+        self._check_type(self.data_type.default_factory())
+
+        if self.default_number_float is None:
+            self.default_number_float = self.data_type.default_factory()
+
+        if not isinstance(self.default_number_float, BasicType):
+            self.default_number_float = BasicType(self.default_number_float)
+        if not isinstance(self.default_number_int, BasicType):
+            self.default_number_int = BasicType(self.default_number_int)
--- a/pystencils/cpu/__init__.py
+++ b/pystencils/cpu/__init__.py
 from pystencils.cpu.cpujit import make_python_function
-from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel
+from pystencils.cpu.kernelcreation import add_openmp, create_indexed_kernel, create_kernel, add_pragmas

-__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'make_python_function']
+__all__ = ['create_kernel', 'create_indexed_kernel', 'add_openmp', 'add_pragmas', 'make_python_function']
--- a/pystencils/cpu/cpujit.py
+++ b/pystencils/cpu/cpujit.py
@@ -13,7 +13,7 @@ in a configuration file.
 3. or in your home directory at ``~/.config/pystencils/config.json`` (Linux) or
   ``%HOMEPATH%\.pystencils\config.json`` (Windows)

-If no configuration file is found, a default configuration is created at the above mentioned location in your home.
+If no configuration file is found, a default configuration is created at the above-mentioned location in your home.
 So run *pystencils* once, then edit the created configuration file.


@@ -23,7 +23,7 @@ Compiler Config (Linux)
 - **'os'**: should be detected automatically as 'linux'
 - **'command'**: path to C++ compiler (defaults to 'g++')
 - **'flags'**: space separated list of compiler flags. Make sure to activate OpenMP in your compiler
- **'restrict_qualifier'**: the restrict qualifier is not standardized accross compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For most Linux compilers the qualifier is ``__restrict__``


@@ -39,30 +39,36 @@ Then 'cl.exe' is used to compile.
  where Visual Studio is installed. This path has to contain a file called 'vcvarsall.bat'
 - **'arch'**: 'x86' or 'x64'
 - **'flags'**: flags passed to 'cl.exe', make sure OpenMP is activated
- **'restrict_qualifier'**: the restrict qualifier is not standardized across compilers.
+- **'restrict_qualifier'**: the 'restrict' qualifier is not standardized across compilers.
  For Windows compilers the qualifier should be ``__restrict``

 """
+from appdirs import user_cache_dir, user_config_dir
+from collections import OrderedDict
 import hashlib
+import importlib.util
 import json
 import os
 import platform
 import shutil
 import subprocess
+import sysconfig
+import tempfile
 import textwrap
-from collections import OrderedDict
-from sysconfig import get_paths
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+import time
+import warnings
+import pathlib

 import numpy as np
-from appdirs import user_cache_dir, user_config_dir

 from pystencils import FieldType
 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.backends.cbackend import generate_c, get_headers, CFunction
-from pystencils.data_types import cast_func, VectorType, vector_memory_access
+from pystencils.backends.cbackend import generate_c, get_headers
+from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
+from pystencils.cpu.msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
+from pystencils.typing import BasicType, CastFunc, VectorType, VectorMemoryAccess
 from pystencils.utils import atomic_file_write, recursive_dict_update


@@ -118,15 +124,15 @@ def get_configuration_file_path():

    # 1) Read path from environment variable if found
    if 'PYSTENCILS_CONFIG' in os.environ:
-        return os.environ['PYSTENCILS_CONFIG'], True
+        return os.environ['PYSTENCILS_CONFIG']
    # 2) Look in current directory for pystencils.json
    elif os.path.exists("pystencils.json"):
-        return "pystencils.json", True
+        return "pystencils.json"
    # 3) Try ~/.pystencils.json
    elif os.path.exists(config_path_in_home):
-        return config_path_in_home, True
+        return config_path_in_home
    else:
-        return config_path_in_home, False
+        return config_path_in_home


 def create_folder(path, is_file):
@@ -146,7 +152,7 @@ def read_config():
            ('flags', '-Ofast -DNDEBUG -fPIC -march=native -fopenmp -std=c++11'),
            ('restrict_qualifier', '__restrict__')
        ])
-        if platform.machine().startswith('ppc64'):
+        if platform.machine().startswith('ppc64') or platform.machine() == 'arm64':
            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native',
                                                                                        '-mcpu=native')
    elif platform.system().lower() == 'windows':
@@ -157,6 +163,9 @@ def read_config():
            ('flags', '/Ox /fp:fast /OpenMP /arch:avx'),
            ('restrict_qualifier', '__restrict')
        ])
+        if platform.machine() == 'ARM64':
+            default_compiler_config['arch'] = 'ARM64'
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace(' /arch:avx', '')
    elif platform.system().lower() == 'darwin':
        default_compiler_config = OrderedDict([
            ('os', 'darwin'),
@@ -165,15 +174,19 @@ def read_config():
            ('restrict_qualifier', '__restrict__')
        ])
        if platform.machine() == 'arm64':
-            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', '')
+            if 'sme' in get_supported_instruction_sets():
+                flag = '-march=armv8.7-a+sme '
+            else:
+                flag = ''
+            default_compiler_config['flags'] = default_compiler_config['flags'].replace('-march=native ', flag)
        for libomp in ['/opt/local/lib/libomp/libomp.dylib', '/usr/local/lib/libomp.dylib',
                       '/opt/homebrew/lib/libomp.dylib']:
            if os.path.exists(libomp):
                default_compiler_config['flags'] += ' ' + libomp
                break
    else:
-        raise ValueError("The detection of the platform with platform.system() did not work. "
-                         "Pystencils is only supported for linux, windows, and darwin platforms.")
+        raise NotImplementedError('Generation of default compiler flags for %s is not implemented' %
+                                  (platform.system(),))

    default_cache_config = OrderedDict([
        ('object_cache', os.path.join(user_cache_dir('pystencils'), 'objectcache')),
@@ -183,16 +196,22 @@ def read_config():
    default_config = OrderedDict([('compiler', default_compiler_config),
                                  ('cache', default_cache_config)])

-    config_path, config_exists = get_configuration_file_path()
+    from fasteners import InterProcessLock
+
+    config_path = pathlib.Path(get_configuration_file_path())
+    config_path.parent.mkdir(parents=True, exist_ok=True)
+    
    config = default_config.copy()
-    if config_exists:
-        with open(config_path, 'r') as json_config_file:
-            loaded_config = json.load(json_config_file)
-        config = recursive_dict_update(config, loaded_config)
-    else:
-        create_folder(config_path, True)
-        with open(config_path, 'w') as f:
-            json.dump(config, f, indent=4)
+
+    lockfile = config_path.with_suffix(config_path.suffix + ".lock")
+    with InterProcessLock(lockfile):
+        if config_path.exists():
+            with open(config_path, 'r') as json_config_file:
+                loaded_config = json.load(json_config_file)
+            config = recursive_dict_update(config, loaded_config)
+        else:
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=4)

    if config['cache']['object_cache'] is not False:
        config['cache']['object_cache'] = os.path.expanduser(config['cache']['object_cache']).format(pid=os.getpid())
@@ -213,12 +232,11 @@ def read_config():
            shutil.rmtree(config['cache']['object_cache'], ignore_errors=True)

        create_folder(config['cache']['object_cache'], False)
-        with NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
+        with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(cache_status_file), delete=False) as f:
            json.dump(config['compiler'], f, indent=4)
        os.replace(f.name, cache_status_file)

    if config['compiler']['os'] == 'windows':
-        from pystencils.cpu.msvc_detection import get_environment
        msvc_env = get_environment(config['compiler']['msvc_version'], config['compiler']['arch'])
        if 'env' not in config['compiler']:
            config['compiler']['env'] = {}
@@ -265,6 +283,7 @@ def clear_cache():
        create_folder(cache_config['object_cache'], False)


+# TODO don't hardcode C type. [1] of tuple output
 type_mapping = {
    np.float32: ('PyFloat_AsDouble', 'float'),
    np.float64: ('PyFloat_AsDouble', 'double'),
@@ -274,8 +293,6 @@ type_mapping = {
    np.uint16: ('PyLong_AsUnsignedLong', 'uint16_t'),
    np.uint32: ('PyLong_AsUnsignedLong', 'uint32_t'),
    np.uint64: ('PyLong_AsUnsignedLong', 'uint64_t'),
-    np.complex64: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexFloat'),
-    np.complex128: (('PyComplex_RealAsDouble', 'PyComplex_ImagAsDouble'), 'ComplexDouble'),
 }

 template_extract_scalar = """
@@ -285,14 +302,6 @@ if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '
 if( PyErr_Occurred() ) {{ return NULL; }}
 """

-template_extract_complex = """
-PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
-if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
-{target_type} {name}{{ ({real_type}) {extract_function_real}( obj_{name} ),
-                       ({real_type}) {extract_function_imag}( obj_{name} ) }};
-if( PyErr_Occurred() ) {{ return NULL; }}
-"""
-
 template_extract_array = """
 PyObject * obj_{name} = PyDict_GetItemString(kwargs, "{name}");
 if( obj_{name} == NULL) {{  PyErr_SetString(PyExc_TypeError, "Keyword argument '{name}' missing"); return NULL; }};
@@ -388,7 +397,7 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                aligned = False
                if ast_node.assignments:
                    aligned = any([a.lhs.args[2] for a in ast_node.assignments
-                                   if hasattr(a, 'lhs') and isinstance(a.lhs, cast_func)
+                                   if hasattr(a, 'lhs') and isinstance(a.lhs, CastFunc)
                                   and hasattr(a.lhs, 'dtype') and isinstance(a.lhs.dtype, VectorType)])

                if ast_node.instruction_set and aligned:
@@ -398,9 +407,10 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
                        for loop in ast_node.atoms(LoopOverCoordinate):
                            has_openmp = has_openmp or any(['#pragma omp' in p for p in loop.prefix_lines])
                            has_nontemporal = has_nontemporal or any([a.args[0].field == field and a.args[3] for a in
-                                                                      loop.atoms(vector_memory_access)])
+                                                                      loop.atoms(VectorMemoryAccess)])
                        if has_openmp and has_nontemporal:
-                            byte_width = ast_node.instruction_set['cachelineSize']
+                            cl_size = ast_node.instruction_set['cachelineSize']
+                            byte_width = f"({cl_size}) < SIZE_MAX ? ({cl_size}) : ({byte_width})"
                    offset = max(max(ast_node.ghost_layers)) * item_size
                    offset_cond = f"(((uintptr_t) buffer_{field.name}.buf) + {offset}) % ({byte_width}) == 0"

@@ -449,21 +459,11 @@ def create_function_boilerplate_code(parameter_info, name, ast_node, insert_chec
            parameters.append(f"buffer_{field.name}.strides[{param.symbol.coordinate}] / {item_size}")
        elif param.is_field_shape:
            parameters.append(f"buffer_{param.field_name}.shape[{param.symbol.coordinate}]")
-        elif type(param.symbol) is CFunction:
-            continue
        else:
            extract_function, target_type = type_mapping[param.symbol.dtype.numpy_dtype.type]
-            if np.issubdtype(param.symbol.dtype.numpy_dtype, np.complexfloating):
-                pre_call_code += template_extract_complex.format(extract_function_real=extract_function[0],
-                                                                 extract_function_imag=extract_function[1],
-                                                                 target_type=target_type,
-                                                                 real_type="float" if target_type == "ComplexFloat"
-                                                                 else "double",
-                                                                 name=param.symbol.name)
-            else:
-                pre_call_code += template_extract_scalar.format(extract_function=extract_function,
-                                                                target_type=target_type,
-                                                                name=param.symbol.name)
+            pre_call_code += template_extract_scalar.format(extract_function=extract_function,
+                                                            target_type=target_type,
+                                                            name=param.symbol.name)

            parameters.append(param.symbol.name)

@@ -483,18 +483,15 @@ def create_module_boilerplate_code(module_name, names):


 def load_kernel_from_file(module_name, function_name, path):
-    from importlib.util import spec_from_file_location, module_from_spec
    try:
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
-        import time
-        import warnings
        warnings.warn(f"Could not load {path}, trying on more time in 5 seconds ...")
        time.sleep(5)
-        spec = spec_from_file_location(name=module_name, location=path)
-        mod = module_from_spec(spec)
+        spec = importlib.util.spec_from_file_location(name=module_name, location=path)
+        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)

    return getattr(mod, function_name)
@@ -533,9 +530,13 @@ class ExtensionModuleCode:

        headers = {'<math.h>', '<stdint.h>'}
        for ast in self._ast_nodes:
+            for field in ast.fields_accessed:
+                if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                    # Add the half precision header only if half precision numbers occur in the AST
+                    headers.add('"half_precision.h"')
            headers.update(get_headers(ast))
-        header_list = list(headers)
-        header_list.sort()
+
+        header_list = sorted(headers)
        header_list.insert(0, '"Python.h"')
        ps_headers = [os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]) for h in header_list
                      if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'include', h[1:-1]))]
@@ -572,7 +573,7 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):
        compile_flags = []

    compiler_config = get_compiler_config()
-    extra_flags = ['-I' + get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags
+    extra_flags = ['-I' + sysconfig.get_paths()['include'], '-I' + get_pystencils_include_path()] + compile_flags

    if compiler_config['os'].lower() == 'windows':
        lib_suffix = '.pyd'
@@ -606,7 +607,6 @@ def compile_module(code, code_hash, base_dir, compile_flags=None):

        # Linking
        if windows:
-            import sysconfig
            config_vars = sysconfig.get_config_vars()
            py_lib = os.path.join(config_vars["installed_base"], "libs",
                                  f"python{config_vars['py_version_nodot']}.lib")
@@ -627,7 +627,12 @@ def compile_and_load(ast, custom_backend=None):
    cache_config = get_cache_config()

    compiler_config = get_compiler_config()
-    function_prefix = '__declspec(dllexport)' if compiler_config['os'].lower() == 'windows' else ''
+    if compiler_config['os'].lower() == 'windows':
+        function_prefix = '__declspec(dllexport)'
+    elif ast.instruction_set and 'function_prefix' in ast.instruction_set:
+        function_prefix = ast.instruction_set['function_prefix']
+    else:
+        function_prefix = ''

    code = ExtensionModuleCode(custom_backend=custom_backend)
    code.add_function(ast, ast.function_name)
@@ -640,7 +645,7 @@ def compile_and_load(ast, custom_backend=None):
        compile_flags = ast.instruction_set['compile_flags']

    if cache_config['object_cache'] is False:
-        with TemporaryDirectory() as base_dir:
+        with tempfile.TemporaryDirectory() as base_dir:
            lib_file = compile_module(code, code_hash_str, base_dir, compile_flags=compile_flags)
            result = load_kernel_from_file(code_hash_str, ast.function_name, lib_file)
    else:

--- a/pystencils/cpu/kernelcreation.py
+++ b/pystencils/cpu/kernelcreation.py
-from typing import List, Union
-
 import sympy as sp
-import numpy as np

 import pystencils.astnodes as ast
-from pystencils.assignment import Assignment
+from pystencils.config import CreateKernelConfig
 from pystencils.enums import Target, Backend
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.cpu.cpujit import make_python_function
-from pystencils.data_types import StructType, TypedSymbol, create_type
+from pystencils.typing import StructType, TypedSymbol, create_type
+from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
+from pystencils.node_collection import NodeCollection
 from pystencils.transformations import (
-    add_types, filtered_tree_iteration, get_base_buffer_index, get_optimal_loop_ordering, make_loop_over_domain,
+    filtered_tree_iteration, iterate_loops_by_depth, get_base_buffer_index, get_optimal_loop_ordering,
+    make_loop_over_domain, add_outer_loop_over_indexed_elements,
    move_constants_before_loop, parse_base_pointer_info, resolve_buffer_accesses,
    resolve_field_accesses, split_inner_loop)

-AssignmentOrAstNodeList = List[Union[Assignment, ast.Node]]
-

-def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "kernel", type_info='double',
-                  split_groups=(), iteration_slice=None, ghost_layers=None,
-                  skip_independence_check=False, allow_double_writes=False) -> KernelFunction:
+def create_kernel(assignments: NodeCollection,
+                  config: CreateKernelConfig) -> KernelFunction:
    """Creates an abstract syntax tree for a kernel function, by taking a list of update rules.

    Loops are created according to the field accesses in the equations.
@@ -28,39 +25,25 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
    Args:
        assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
        Defining the update rules of the kernel
-        function_name: name of the generated function - only important if generated code is written out
-        type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
-                   be of type 'double' except symbols which occur on the left hand side of equations where the
-                   right hand side is a sympy Boolean which are assumed to be 'bool' .
-        split_groups: Specification on how to split up inner loop into multiple loops. For details see
-                      transformation :func:`pystencils.transformation.split_inner_loop`
-        iteration_slice: if not None, iteration is done only over this slice of the field
-        ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
-                      that should be excluded from the iteration.
-                     if None, the number of ghost layers is determined automatically and assumed to be equal for a
-                     all dimensions
-        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
-                                 periodicity kernel, that access the field outside the iteration bounds. Use with care!
-        allow_double_writes: If True, don't check if every field is only written at a single location. This is required
-                             for example for kernels that are compiled with loop step sizes > 1, that handle multiple 
-                             cells at once. Use with care!
+        config: create kernel config

    Returns:
        AST node representing a function, that can be printed as C or CUDA code
    """
-    def type_symbol(term):
-        if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
-            return term
-        elif isinstance(term, sp.Symbol):
-            if isinstance(type_info, str) or not hasattr(type_info, '__getitem__'):
-                return TypedSymbol(term.name, create_type(type_info))
-            else:
-                return TypedSymbol(term.name, type_info[term.name])
-        else:
-            raise ValueError("Term has to be field access or symbol")
+    function_name = config.function_name
+    iteration_slice = config.iteration_slice
+    ghost_layers = config.ghost_layers
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
+
+    split_groups = ()
+    if 'split_groups' in assignments.simplification_hints:
+        split_groups = assignments.simplification_hints['split_groups']
+    assignments = assignments.all_assignments
+
+    # TODO Cleanup: move add_types to create_domain_kernel or create_kernel
+    assignments = add_types(assignments, config)

-    fields_read, fields_written, assignments = add_types(
-        assignments, type_info, not skip_independence_check, check_double_write_condition=not allow_double_writes)
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])

@@ -71,14 +54,31 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke
    loop_order = get_optimal_loop_ordering(fields_without_buffers)
    loop_node, ghost_layer_info = make_loop_over_domain(body, iteration_slice=iteration_slice,
                                                        ghost_layers=ghost_layers, loop_order=loop_order)
+    loop_node = add_outer_loop_over_indexed_elements(loop_node)
+
    ast_node = KernelFunction(loop_node, Target.CPU, Backend.C, compile_function=make_python_function,
                              ghost_layers=ghost_layer_info, function_name=function_name, assignments=assignments)

    if split_groups:
+        type_info = config.data_type
+
+        def type_symbol(term):
+            if isinstance(term, Field.Access) or isinstance(term, TypedSymbol):
+                return term
+            elif isinstance(term, sp.Symbol):
+                if isinstance(type_info, str) or not hasattr(type_info, '__getitem__'):
+                    return TypedSymbol(term.name, create_type(type_info))
+                else:
+                    return TypedSymbol(term.name, type_info[term.name])
+            else:
+                raise ValueError("Term has to be field access or symbol")
+
        typed_split_groups = [[type_symbol(s) for s in split_group] for split_group in split_groups]
        split_inner_loop(ast_node, typed_split_groups)

-    base_pointer_spec = [['spatialInner0'], ['spatialInner1']] if len(loop_order) >= 2 else [['spatialInner0']]
+    base_pointer_spec = config.base_pointer_specification
+    if base_pointer_spec is None:
+        base_pointer_spec = []
    base_pointer_info = {field.name: parse_base_pointer_info(base_pointer_spec, loop_order,
                                                             field.spatial_dimensions, field.index_dimensions)
                         for field in fields_without_buffers}
@@ -90,13 +90,14 @@ def create_kernel(assignments: AssignmentOrAstNodeList, function_name: str = "ke

    if any(FieldType.is_buffer(f) for f in all_fields):
        resolve_buffer_accesses(ast_node, get_base_buffer_index(ast_node), read_only_fields)
+    # TODO think about typing
    resolve_field_accesses(ast_node, read_only_fields, field_to_base_pointer_info=base_pointer_info)
    move_constants_before_loop(ast_node)
    return ast_node


-def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, function_name="kernel",
-                          type_info=None, coordinate_names=('x', 'y', 'z')) -> KernelFunction:
+def create_indexed_kernel(assignments: NodeCollection,
+                          config: CreateKernelConfig) -> KernelFunction:
    """
    Similar to :func:`create_kernel`, but here not all cells of a field are updated but only cells with
    coordinates which are stored in an index field. This traversal method can e.g. be used for boundary handling.
@@ -108,31 +109,39 @@ def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, fu

    Args:
        assignments: list of assignments
-        index_fields: list of index fields, i.e. 1D fields with struct data type
-        type_info: see documentation of :func:`create_kernel`
-        function_name: see documentation of :func:`create_kernel`
-        coordinate_names: name of the coordinate fields in the struct data type
+        config: Kernel configuration
    """
-    fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
+    function_name = config.function_name
+    index_fields = config.index_fields
+    coordinate_names = config.coordinate_names
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
+
    all_fields = fields_read.union(fields_written)
+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
+
+    assignments = assignments.all_assignments
+    assignments = add_types(assignments, config)

    for index_field in index_fields:
        index_field.field_type = FieldType.INDEXED
        assert FieldType.is_indexed(index_field)
        assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"

-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]
-
    def get_coordinate_symbol_assignment(name):
        for idx_field in index_fields:
            assert isinstance(idx_field.dtype, StructType), "Index fields have to have a struct data type"
            data_type = idx_field.dtype
            if data_type.has_element(name):
                rhs = idx_field[0](name)
-                lhs = TypedSymbol(name, np.int64)
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                return SympyAssignment(lhs, rhs)
        raise ValueError(f"Index {name} not found in any of the passed index fields")

@@ -207,3 +216,18 @@ def add_openmp(ast_node, schedule="static", num_threads=True, collapse=None, ass
        if collapse:
            prefix += f" collapse({collapse})"
        loop_to_parallelize.prefix_lines.append(prefix)
+
+
+def add_pragmas(ast_node, pragma_lines, nesting_depth=-1):
+    """Prepends given pragma lines to all loops of specified nesting depth.
+    
+    Args:
+        ast_node: pystencils abstract syntax tree
+        pragma_lines: Iterable of strings containing the pragma lines
+        nesting_depth: Nesting depth of the loops the pragmas should be applied to.
+                       Outermost loop has depth 0.
+                       A depth of -1 indicates the innermost loops.
+    """
+    loop_nodes = iterate_loops_by_depth(ast_node, nesting_depth)
+    for n in loop_nodes:
+        n.prefix_lines += list(pragma_lines)
--- a/pystencils/cpu/msvc_detection.py
+++ b/pystencils/cpu/msvc_detection.py
--- a/pystencils/cpu/vectorization.py
+++ b/pystencils/cpu/vectorization.py
@@ -3,13 +3,13 @@ from typing import Container, Union

 import numpy as np
 import sympy as sp
-from sympy.logic.boolalg import BooleanFunction
+from sympy.logic.boolalg import BooleanFunction, BooleanAtom

 import pystencils.astnodes as ast
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
-from pystencils.data_types import (
-    PointerType, TypedSymbol, VectorType, cast_func, collate_types, get_type_of_expression, vector_memory_access)
-from pystencils.fast_approximation import fast_division, fast_inv_sqrt, fast_sqrt
+from pystencils.typing import (BasicType, PointerType, TypedSymbol, VectorType, CastFunc, collate_types,
+                               get_type_of_expression, VectorMemoryAccess)
+from pystencils.functions import DivFunc
 from pystencils.field import Field
 from pystencils.integer_functions import modulo_ceil, modulo_floor
 from pystencils.sympyextensions import fast_subs
@@ -76,6 +76,8 @@ class CachelineSize(ast.Node):
 def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
              assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
+    # TODO Vectorization Revamp we first introduce the remainder loop and then check if we can even vectorise.
+    #  Maybe first copy the ast and return the copied version on failure
    """Explicit vectorization using SIMD vectorization via intrinsics.

    Args:
@@ -121,24 +123,31 @@ def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
-    default_float_type = 'double' if float_size == 8 else 'float'
+    default_float_type = 'float64' if float_size == 8 else 'float32'
    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
-    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is

+    if nontemporal and 'cachelineZero' in vector_is:
+        kernel_ast.use_all_written_field_sizes = True
    strided = 'storeS' in vector_is and 'loadS' in vector_is
-    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
-    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal,
-                                                strided, keep_loop_stop, assume_sufficient_line_padding)
-    insert_vector_casts(kernel_ast, default_float_type)
+    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned and 'storeA' in vector_is else 'storeU']
+    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, assume_aligned, nontemporal,
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type)


-def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_aligned, nontemporal_fields,
-                                                strided, keep_loop_stop, assume_sufficient_line_padding):
+def vectorize_inner_loops_and_adapt_load_stores(ast_node, assume_aligned, nontemporal_fields,
+                                                strided, keep_loop_stop, assume_sufficient_line_padding,
+                                                default_float_type):
    """Goes over all innermost loops, changes increment to vector width and replaces field accesses by vector type."""
-    all_loops = filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment)
-    inner_loops = [n for n in all_loops if n.is_innermost_loop]
-    zero_loop_counters = {l.loop_counter_symbol: 0 for l in all_loops}
+    all_loops = list(filtered_tree_iteration(ast_node, ast.LoopOverCoordinate, stop_type=ast.SympyAssignment))
+    inner_loops = [loop for loop in all_loops if loop.is_innermost_loop]
+    zero_loop_counters = {loop.loop_counter_symbol: 0 for loop in all_loops}
+
+    vector_is = ast_node.instruction_set
+    assert vector_is, "The ast needs to hold information about the instruction_set for the vectorisation"
+    vector_width = vector_is['width']
+    vector_int_width = vector_is['intwidth']

    for loop_node in inner_loops:
        loop_range = loop_node.stop - loop_node.start
@@ -152,11 +161,14 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
            loop_node.stop = new_stop
        else:
            cutting_point = modulo_floor(loop_range, vector_width) + loop_node.start
-            loop_nodes = [l for l in cut_loop(loop_node, [cutting_point]).args if isinstance(l, ast.LoopOverCoordinate)]
+            # TODO cut_loop calls deepcopy on the loop_node. This is bad as documented in cut_loop
+            loop_nodes = [loop for loop in cut_loop(loop_node, [cutting_point]).args
+                          if isinstance(loop, ast.LoopOverCoordinate)]
            assert len(loop_nodes) in (0, 1, 2)  # 2 for main and tail loop, 1 if loop range divisible by vector width
            if len(loop_nodes) == 0:
                continue
            loop_node = loop_nodes[0]
+            # loop_node is the vectorized one

        # Find all array accesses (indexed) that depend on the loop counter as offset
        loop_counter_symbol = ast.LoopOverCoordinate.get_loop_counter_symbol(loop_node.coordinate_to_loop_over)
@@ -165,23 +177,29 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
        for indexed in loop_node.atoms(sp.Indexed):
            base, index = indexed.args
            if loop_counter_symbol in index.atoms(sp.Symbol):
+                if 'loadA' not in vector_is and 'storeA' not in vector_is and 'maskStoreA' not in vector_is:
+                    # don't need to generate the alignment check when there are no aligned load/store instructions
+                    aligned_access = False
+                else:
+                    if not isinstance(vector_width, int):
+                        raise NotImplementedError('Access alignment cannot be statically determined for sizeless '
+                                                  'vector ISAs')
+                    aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) % vector_width == 0
                loop_counter_is_offset = loop_counter_symbol not in (index - loop_counter_symbol).atoms()
-                aligned_access = (index - loop_counter_symbol).subs(zero_loop_counters) == 0
                stride = sp.simplify(index.subs({loop_counter_symbol: loop_counter_symbol + 1}) - index)
                if not loop_counter_is_offset and (not strided or loop_counter_symbol in stride.atoms()):
                    successful = False
                    break
                typed_symbol = base.label
-                assert type(typed_symbol.dtype) is PointerType, \
-                    f"Type of access is {typed_symbol.dtype}, {indexed}"
+                assert type(typed_symbol.dtype) is PointerType, f"Type of access is {typed_symbol.dtype}, {indexed}"

                vec_type = VectorType(typed_symbol.dtype.base_type, vector_width)
                use_aligned_access = aligned_access and assume_aligned
                nontemporal = False
                if hasattr(indexed, 'field'):
                    nontemporal = (indexed.field in nontemporal_fields) or (indexed.field.name in nontemporal_fields)
-                substitutions[indexed] = vector_memory_access(indexed, vec_type, use_aligned_access, nontemporal, True,
-                                                              stride if strided else 1)
+                substitutions[indexed] = VectorMemoryAccess(indexed, vec_type, use_aligned_access, nontemporal, True,
+                                                            stride if strided else 1)
                if nontemporal:
                    # insert NontemporalFence after the outermost loop
                    parent = loop_node.parent
@@ -196,13 +214,13 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a

        loop_node.step = vector_width
        loop_node.subs(substitutions)
-        vector_int_width = ast_node.instruction_set['intwidth']
-        vector_loop_counter = cast_func(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width)) \
-            + cast_func(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
-                        VectorType(loop_counter_symbol.dtype, vector_int_width))
+        arg_1 = CastFunc(loop_counter_symbol, VectorType(loop_counter_symbol.dtype, vector_int_width))
+        arg_2 = CastFunc(tuple(range(vector_int_width if type(vector_int_width) is int else 2)),
+                         VectorType(loop_counter_symbol.dtype, vector_int_width))
+        vector_loop_counter = arg_1 + arg_2

        fast_subs(loop_node, {loop_counter_symbol: vector_loop_counter},
-                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, vector_memory_access))
+                  skip=lambda e: isinstance(e, ast.ResolvedFieldAccess) or isinstance(e, VectorMemoryAccess))

        mask_conditionals(loop_node)

@@ -214,6 +232,7 @@ def vectorize_inner_loops_and_adapt_load_stores(ast_node, vector_width, assume_a
            substitutions.update({s[0]: s[1] for s in zip(rng.result_symbols, new_result_symbols)})
            rng._symbols_defined = set(new_result_symbols)
        fast_subs(loop_node, substitutions, skip=lambda e: isinstance(e, RNGBase))
+        insert_vector_casts(loop_node, vector_is, default_float_type)


 def mask_conditionals(loop_body):
@@ -232,8 +251,8 @@ def mask_conditionals(loop_body):
                node.condition_expr = vec_any(node.condition_expr)
        elif isinstance(node, ast.SympyAssignment):
            if mask is not True:
-                s = {ma: vector_memory_access(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
-                     for ma in node.atoms(vector_memory_access)}
+                s = {ma: VectorMemoryAccess(*ma.args[0:4], sp.And(mask, ma.args[4]), *ma.args[5:])
+                     for ma in node.atoms(VectorMemoryAccess)}
                node.subs(s)
        else:
            for arg in node.args:
@@ -242,28 +261,55 @@ def mask_conditionals(loop_body):
    visit_node(loop_body, mask=True)


-def insert_vector_casts(ast_node, default_float_type='double'):
+def insert_vector_casts(ast_node, instruction_set, default_float_type='double'):
    """Inserts necessary casts from scalar values to vector values."""

-    handled_functions = (sp.Add, sp.Mul, fast_division, fast_sqrt, fast_inv_sqrt, vec_any, vec_all)
-
-    def visit_expr(expr, default_type='double'):
-        if isinstance(expr, vector_memory_access):
-            return vector_memory_access(*expr.args[0:4], visit_expr(expr.args[4], default_type), *expr.args[5:])
-        elif isinstance(expr, cast_func):
-            return expr
-        elif expr.func is sp.Abs and 'abs' not in ast_node.instruction_set:
-            new_arg = visit_expr(expr.args[0], default_type)
-            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is vector_memory_access \
+    handled_functions = (sp.Add, sp.Mul, vec_any, vec_all, DivFunc, sp.Abs)
+
+    def is_scalar(expr) -> bool:
+        if hasattr(expr, "dtype"):
+            if type(expr.dtype) is VectorType:
+                return False
+            # Else branch: If expr is a CastFunc, then whether the expression
+            # is scalar is determined by the argument (remember: vector casts
+            # are not inserted yet). Therefore, we must recurse into the args of
+            # expr below. Otherwise, this expression is atomic and in that case
+            # it is assumed to be scalar below.
+
+        if isinstance(expr, ast.ResolvedFieldAccess):
+            # expr.field is not in expr.args
+            return is_scalar(expr.field)
+        elif isinstance(expr, (vec_any, vec_all)):
+            return True
+
+        if not hasattr(expr, "args"):
+            return True
+
+        return all(is_scalar(arg) for arg in expr.args)
+
+    # TODO Vectorization Revamp: get rid of default_type
+    def visit_expr(expr, default_type='double', force_vectorize=False):
+        if isinstance(expr, VectorMemoryAccess):
+            return VectorMemoryAccess(*expr.args[0:4], visit_expr(expr.args[4], default_type, force_vectorize),
+                                      *expr.args[5:])
+        elif isinstance(expr, CastFunc):
+            cast_type = expr.args[1]
+            arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            assert cast_type in [BasicType('float32'), BasicType('float64')], \
+                f'Vectorization cannot vectorize type {cast_type}'
+            return expr.func(arg, VectorType(cast_type, instruction_set['width']))
+        elif expr.func is sp.Abs and 'abs' not in instruction_set:
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
+            base_type = get_type_of_expression(expr.args[0]).base_type if type(expr.args[0]) is VectorMemoryAccess \
                else get_type_of_expression(expr.args[0])
-            pw = sp.Piecewise((-new_arg, new_arg < cast_func(0, base_type.numpy_dtype)),
+            pw = sp.Piecewise((-new_arg, new_arg < CastFunc(0, base_type.numpy_dtype)),
                              (new_arg, True))
-            return visit_expr(pw, default_type)
+            return visit_expr(pw, default_type, force_vectorize)
        elif expr.func in handled_functions or isinstance(expr, sp.Rel) or isinstance(expr, BooleanFunction):
            if expr.func is sp.Mul and expr.args[0] == -1:
                # special treatment for the unary minus: make sure that the -1 has the same type as the argument
                dtype = int
-                for arg in expr.atoms(vector_memory_access):
+                for arg in expr.atoms(VectorMemoryAccess):
                    if arg.dtype.base_type.is_float():
                        dtype = arg.dtype.base_type.numpy_dtype.type
                for arg in expr.atoms(TypedSymbol):
@@ -273,22 +319,42 @@ def insert_vector_casts(ast_node, default_float_type='double'):
                    if dtype is np.float32:
                        default_type = 'float'
                    expr = sp.Mul(dtype(expr.args[0]), *expr.args[1:])
-            new_args = [visit_expr(a, default_type) for a in expr.args]
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args]
            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
            if not any(type(t) is VectorType for t in arg_types):
                return expr
            else:
                target_type = collate_types(arg_types)
                casted_args = [
-                    cast_func(a, target_type) if t != target_type and not isinstance(a, vector_memory_access) else a
+                    CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
                    for a, t in zip(new_args, arg_types)]
                return expr.func(*casted_args)
+        elif expr.func is sp.UnevaluatedExpr:
+            assert expr.args[0].is_Pow or expr.args[0].is_Mul, "UnevaluatedExpr only implemented holding Mul or Pow"
+            # TODO this is only because cut_loop evaluates the multiplications again due to deepcopy. All this should
+            # TODO be fixed for real at some point.
+            if expr.args[0].is_Pow:
+                base = expr.args[0].base
+                exp = expr.args[0].exp
+                expr = sp.UnevaluatedExpr(sp.Mul(*([base] * +exp), evaluate=False))
+
+            new_args = [visit_expr(a, default_type, force_vectorize) for a in expr.args[0].args]
+            arg_types = [get_type_of_expression(a, default_float_type=default_type) for a in new_args]
+
+            target_type = collate_types(arg_types)
+            if not any(type(t) is VectorType for t in arg_types):
+                target_type = VectorType(target_type, instruction_set['width'])
+
+            casted_args = [
+                CastFunc(a, target_type) if t != target_type and not isinstance(a, VectorMemoryAccess) else a
+                for a, t in zip(new_args, arg_types)]
+            return expr.func(expr.args[0].func(*casted_args, evaluate=False))
        elif expr.func is sp.Pow:
-            new_arg = visit_expr(expr.args[0], default_type)
+            new_arg = visit_expr(expr.args[0], default_type, force_vectorize)
            return expr.func(new_arg, expr.args[1])
        elif expr.func == sp.Piecewise:
-            new_results = [visit_expr(a[0], default_type) for a in expr.args]
-            new_conditions = [visit_expr(a[1], default_type) for a in expr.args]
+            new_results = [visit_expr(a[0], default_type, force_vectorize) for a in expr.args]
+            new_conditions = [visit_expr(a[1], default_type, force_vectorize) for a in expr.args]
            types_of_results = [get_type_of_expression(a) for a in new_results]
            types_of_conditions = [get_type_of_expression(a) for a in new_conditions]

@@ -299,34 +365,54 @@ def insert_vector_casts(ast_node, default_float_type='double'):
            if type(condition_target_type) is not VectorType and type(result_target_type) is VectorType:
                condition_target_type = VectorType(condition_target_type, width=result_target_type.width)

-            casted_results = [cast_func(a, result_target_type) if t != result_target_type else a
+            casted_results = [CastFunc(a, result_target_type) if t != result_target_type else a
                              for a, t in zip(new_results, types_of_results)]

-            casted_conditions = [cast_func(a, condition_target_type)
+            casted_conditions = [CastFunc(a, condition_target_type)
                                 if t != condition_target_type and a is not True else a
                                 for a, t in zip(new_conditions, types_of_conditions)]

            return sp.Piecewise(*[(r, c) for r, c in zip(casted_results, casted_conditions)])
-        else:
+        elif isinstance(expr, TypedSymbol):
+            if force_vectorize:
+                expr_type = get_type_of_expression(expr)
+                if type(expr_type) is not VectorType:
+                    vector_type = VectorType(expr_type, instruction_set['width'])
+                    return CastFunc(expr, vector_type)
            return expr
+        elif isinstance(expr, (sp.Number, BooleanAtom)):
+            return expr
+        else:
+            raise NotImplementedError(f'Due to defensive programming we handle only specific expressions.\n'
+                                      f'The expression {expr} of type {type(expr)} is not known yet.')

    def visit_node(node, substitution_dict, default_type='double'):
        substitution_dict = substitution_dict.copy()
        for arg in node.args:
            if isinstance(arg, ast.SympyAssignment):
                assignment = arg
+                # If there is a remainder loop we do not vectorise it, thus lhs will indicate this
+                # if isinstance(assignment.lhs, ast.ResolvedFieldAccess):
+                # continue
                subs_expr = fast_subs(assignment.rhs, substitution_dict,
                                      skip=lambda e: isinstance(e, ast.ResolvedFieldAccess))
-                assignment.rhs = visit_expr(subs_expr, default_type)
-                rhs_type = get_type_of_expression(assignment.rhs)
+
+                # If either side contains a vectorized subexpression, both sides
+                # must be fully vectorized.
+                lhs_scalar = is_scalar(assignment.lhs)
+                rhs_scalar = is_scalar(subs_expr)
+
+                assignment.rhs = visit_expr(subs_expr, default_type, force_vectorize=not (lhs_scalar and rhs_scalar))
+
                if isinstance(assignment.lhs, TypedSymbol):
-                    lhs_type = assignment.lhs.dtype
-                    if type(rhs_type) is VectorType and type(lhs_type) is not VectorType:
+                    if lhs_scalar and not rhs_scalar:
+                        lhs_type = get_type_of_expression(assignment.lhs)
+                        rhs_type = get_type_of_expression(assignment.rhs)
                        new_lhs_type = VectorType(lhs_type, rhs_type.width)
                        new_lhs = TypedSymbol(assignment.lhs.name, new_lhs_type)
                        substitution_dict[assignment.lhs] = new_lhs
                        assignment.lhs = new_lhs
-                elif isinstance(assignment.lhs, vector_memory_access):
+                elif isinstance(assignment.lhs, VectorMemoryAccess):
                    assignment.lhs = visit_expr(assignment.lhs, default_type)
            elif isinstance(arg, ast.Conditional):
                arg.condition_expr = fast_subs(arg.condition_expr, substitution_dict,

--- a/pystencils/datahandling/__init__.py
+++ b/pystencils/datahandling/__init__.py
@@ -23,7 +23,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                         default_layout: str = 'SoA',
                         default_target: Target = Target.CPU,
                         parallel: bool = False,
-                         default_ghost_layers: int = 1) -> DataHandling:
+                         default_ghost_layers: int = 1,
+                         device_number: Union[int, None] = None) -> DataHandling:
    """Creates a data handling instance.

    Args:
@@ -34,6 +35,9 @@ def create_data_handling(domain_size: Tuple[int, ...],
        default_target: `Target`
        parallel: if True a parallel domain is created using walberla - each MPI process gets a part of the domain
        default_ghost_layers: default number of ghost layers if not overwritten in 'add_array'
+        device_number: If `default_target` is set to 'GPU' and `parallel` is False, a device number should be
+                       specified. If none is given, the device with the largest amount of memory is used. If multiple
+                       devices have the same amount of memory, the one with the lower number is used
    """
    if isinstance(default_target, str):
        new_target = Target[default_target.upper()]
@@ -69,7 +73,8 @@ def create_data_handling(domain_size: Tuple[int, ...],
                                  periodicity=periodicity,
                                  default_target=default_target,
                                  default_layout=default_layout,
-                                  default_ghost_layers=default_ghost_layers)
+                                  default_ghost_layers=default_ghost_layers,
+                                  device_number=device_number)


 __all__ = ['create_data_handling']
--- a/pystencils/datahandling/blockiteration.py
+++ b/pystencils/datahandling/blockiteration.py
@@ -115,7 +115,7 @@ class ParallelBlock(Block):
            result = wlb.field.toArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
        elif 'GpuField' in type_name:
-            result = wlb.cuda.toGpuArray(result, with_ghost_layers=self._gls)
+            result = wlb.gpu.toGpuArray(result, with_ghost_layers=self._gls)
            result = self._normalize_array_shape(result)
        return result


--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -331,6 +331,7 @@ class DataHandling(ABC):
                b[array_name][(Ellipsis, *value_idx)].fill(val)
            else:
                b[array_name].fill(val)
+            self.to_gpu(array_name)

    def min(self, array_name, slice_obj=None, ghost_layers=False, inner_ghost_layers=False, reduce=True):
        """Returns the minimum value inside the domain or slice of the domain.

--- a/pystencils/datahandling/parallel_datahandling.py
+++ b/pystencils/datahandling/parallel_datahandling.py
@@ -9,7 +9,7 @@ from pystencils.datahandling.blockiteration import block_iteration, sliced_block
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.enums import Backend
 from pystencils.field import Field, FieldType
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 from pystencils.utils import DotDict
 from pystencils import Target

@@ -151,8 +151,8 @@ class ParallelDataHandling(DataHandling):
        if gpu:
            if alignment != 0:
                raise ValueError("Alignment for walberla GPU fields not yet supported")
-            wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
-                                          usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
+            wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
+                                         usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])

        if cpu and gpu:
            self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))
@@ -255,7 +255,7 @@ class ParallelDataHandling(DataHandling):
    def get_kernel_kwargs(self, kernel_function, **kwargs):
        if kernel_function.ast.backend == Backend.CUDA:
            name_map = self._field_name_to_gpu_data_name
-            to_array = wlb.cuda.toGpuArray
+            to_array = wlb.gpu.toGpuArray
        else:
            name_map = self._field_name_to_cpu_data_name
            to_array = wlb.field.toArray
@@ -280,7 +280,8 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
@@ -288,20 +289,21 @@ class ParallelDataHandling(DataHandling):
            for block in self.blocks:
                transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
        else:
-            wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

    def is_on_gpu(self, name):
        return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs

    def all_to_cpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_cpu(name)

    def all_to_gpu(self):
        for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
        for name in self._custom_data_transfer_functions.keys():
            self.to_gpu(name)

@@ -328,7 +330,7 @@ class ParallelDataHandling(DataHandling):
                create_packing = wlb.field.createStencilRestrictedPackInfo
        else:
            assert target == Target.GPU
-            create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
+            create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
            names = [self.GPU_DATA_PREFIX + name for name in names]

        sync_function = create_scheme(self.blocks, stencil)

--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -6,11 +6,10 @@ import numpy as np

 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler
 from pystencils.enums import Target
-from pystencils.field import (
-    Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
-    spatial_layout_string_to_tuple)
+from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
+                              layout_string_to_tuple, spatial_layout_string_to_tuple)
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict

@@ -23,7 +22,8 @@ class SerialDataHandling(DataHandling):
                 default_layout: str = 'SoA',
                 periodicity: Union[bool, Sequence[bool]] = False,
                 default_target: Target = Target.CPU,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
        """
        Creates a data handling for single node simulations.

@@ -31,9 +31,17 @@ class SerialDataHandling(DataHandling):
            domain_size: size of the spatial domain as tuple
            default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
            default_layout: default layout used, if  not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
            default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                            allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                            default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
        """
        super(SerialDataHandling, self).__init__()
        self._domainSize = tuple(domain_size)
@@ -48,9 +56,14 @@ class SerialDataHandling(DataHandling):

        if not array_handler:
            try:
-                self.array_handler = PyCudaArrayHandler()
-            except Exception:
-                self.array_handler = PyCudaNotAvailableHandler()
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
+                self.array_handler = GPUNotAvailableHandler()
        else:
            self.array_handler = array_handler

@@ -126,10 +139,14 @@ class SerialDataHandling(DataHandling):
        else:
            layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)

-        # cpu_arr is always created - since there is no create_pycuda_array_with_layout()
+        # cpu_arr is always created - since there is no create_gpu_array_with_layout()
        byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)

        if alignment and gpu:
            raise NotImplementedError("Alignment for GPU fields not supported")
@@ -251,14 +268,16 @@ class SerialDataHandling(DataHandling):
            transfer_func = self._custom_data_transfer_functions[name][1]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])

    def to_gpu(self, name):
        if name in self._custom_data_transfer_functions:
            transfer_func = self._custom_data_transfer_functions[name][0]
            transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
        else:
-            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])

    def is_on_gpu(self, name):
        return name in self.gpu_arrays
@@ -313,7 +332,7 @@ class SerialDataHandling(DataHandling):
                    result.append(functor(filtered_stencil, ghost_layers=gls))
                else:
                    if functor is None:
-                        from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor
+                        from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
                        target = Target.GPU
                    result.append(functor(filtered_stencil, self._domainSize,
                                          index_dimensions=self.fields[name].index_dimensions,
@@ -419,13 +438,19 @@ class SerialDataHandling(DataHandling):
    def world_rank(self):
        return 0

-    def save_all(self, file):
-        np.savez_compressed(file, **self.cpu_arrays)
+    def save_all(self, filename, compressed=True, synchronise_data=True):
+        if synchronise_data:
+            for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()):
+                self.to_cpu(name)
+        if compressed:
+            np.savez_compressed(filename, **self.cpu_arrays)
+        else:
+            np.savez(filename, **self.cpu_arrays)

-    def load_all(self, file):
-        if '.npz' not in file:
-            file += '.npz'
-        file_contents = np.load(file)
+    def load_all(self, filename, synchronise_data=True):
+        if '.npz' not in filename:
+            filename += '.npz'
+        file_contents = np.load(filename)
        for arr_name, arr_contents in self.cpu_arrays.items():
            if arr_name not in file_contents:
                print(f"Skipping read data {arr_name} because there is no data with this name in data handling")
@@ -435,3 +460,6 @@ class SerialDataHandling(DataHandling):
                      f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
                continue
            np.copyto(arr_contents, file_contents[arr_name])
+            if synchronise_data:
+                if arr_name in self.gpu_arrays.keys():
+                    self.to_gpu(arr_name)
--- a/pystencils/datahandling/vtk.py
+++ b/pystencils/datahandling/vtk.py
No results found