Compare revisions

Showing 977 additions and 314 deletions
pystencils/datahandling/parallel_datahandling.py

@@ -9,7 +9,7 @@ from pystencils.datahandling.blockiteration import block_iteration, sliced_block
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.enums import Backend
 from pystencils.field import Field, FieldType
-from pystencils.kernelparameters import FieldPointerSymbol
+from pystencils.typing.typed_sympy import FieldPointerSymbol
 from pystencils.utils import DotDict
 from pystencils import Target

@@ -151,8 +151,8 @@ class ParallelDataHandling(DataHandling):
         if gpu:
             if alignment != 0:
                 raise ValueError("Alignment for walberla GPU fields not yet supported")
-            wlb.cuda.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
-                                          usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
+            wlb.gpu.addGpuFieldToStorage(self.blocks, self.GPU_DATA_PREFIX + name, dtype, fSize=values_per_cell,
+                                         usePitchedMem=False, ghostLayers=ghost_layers, layout=layout_map[layout])
         if cpu and gpu:
             self._cpu_gpu_pairs.append((name, self.GPU_DATA_PREFIX + name))

@@ -255,7 +255,7 @@ class ParallelDataHandling(DataHandling):
     def get_kernel_kwargs(self, kernel_function, **kwargs):
         if kernel_function.ast.backend == Backend.CUDA:
             name_map = self._field_name_to_gpu_data_name
-            to_array = wlb.cuda.toGpuArray
+            to_array = wlb.gpu.toGpuArray
         else:
             name_map = self._field_name_to_cpu_data_name
             to_array = wlb.field.toArray

@@ -280,7 +280,8 @@ class ParallelDataHandling(DataHandling):
             for block in self.blocks:
                 transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
         else:
-            wlb.cuda.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToCpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

@@ -288,20 +289,21 @@ class ParallelDataHandling(DataHandling):
     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             for block in self.blocks:
                 transfer_func(block[self.GPU_DATA_PREFIX + name], block[name])
         else:
-            wlb.cuda.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)
+            if self.is_on_gpu(name):
+                wlb.gpu.copyFieldToGpu(self.blocks, self.GPU_DATA_PREFIX + name, name)

     def is_on_gpu(self, name):
         return (name, self.GPU_DATA_PREFIX + name) in self._cpu_gpu_pairs

     def all_to_cpu(self):
         for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToCpu(self.blocks, gpu_name, cpu_name)
         for name in self._custom_data_transfer_functions.keys():
             self.to_cpu(name)

     def all_to_gpu(self):
         for cpu_name, gpu_name in self._cpu_gpu_pairs:
-            wlb.cuda.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
+            wlb.gpu.copyFieldToGpu(self.blocks, gpu_name, cpu_name)
         for name in self._custom_data_transfer_functions.keys():
             self.to_gpu(name)

@@ -328,7 +330,7 @@ class ParallelDataHandling(DataHandling):
             create_packing = wlb.field.createStencilRestrictedPackInfo
         else:
             assert target == Target.GPU
-            create_packing = wlb.cuda.createPackInfo if buffered else wlb.cuda.createMPIDatatypeInfo
+            create_packing = wlb.gpu.createPackInfo if buffered else wlb.gpu.createMPIDatatypeInfo
             names = [self.GPU_DATA_PREFIX + name for name in names]
         sync_function = create_scheme(self.blocks, stencil)
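The pattern these hunks converge on: every transfer goes through the renamed wlb.gpu module, and per-field transfers are now guarded by is_on_gpu. A hypothetical round trip (requires a waLBerla build with its Python bindings; 'dh' stands for a ParallelDataHandling instance created elsewhere with GPU fields):

    # hypothetical usage; 'dh' is a ParallelDataHandling with fields added via gpu=True
    dh.to_gpu('phi')    # silently skipped unless 'phi' is registered as a cpu/gpu pair
    # ... run GPU kernels ...
    dh.all_to_cpu()     # wlb.gpu.copyFieldToCpu for every registered pair, then custom transfers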
pystencils/datahandling/serial_datahandling.py

@@ -6,12 +6,10 @@ import numpy as np
 from pystencils.datahandling.blockiteration import SerialBlock
 from pystencils.datahandling.datahandling_interface import DataHandling
-from pystencils.datahandling.pycuda import PyCudaArrayHandler, PyCudaNotAvailableHandler
-from pystencils.datahandling.pyopencl import PyOpenClArrayHandler
 from pystencils.enums import Target
-from pystencils.field import (
-    Field, FieldType, create_numpy_array_with_layout, layout_string_to_tuple,
-    spatial_layout_string_to_tuple)
+from pystencils.field import (Field, FieldType, create_numpy_array_with_layout,
+                              layout_string_to_tuple, spatial_layout_string_to_tuple)
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
 from pystencils.slicing import normalize_slice, remove_ghost_layers
 from pystencils.utils import DotDict

@@ -24,9 +22,8 @@ class SerialDataHandling(DataHandling):
                  default_layout: str = 'SoA',
                  periodicity: Union[bool, Sequence[bool]] = False,
                  default_target: Target = Target.CPU,
-                 opencl_queue=None,
-                 opencl_ctx=None,
-                 array_handler=None) -> None:
+                 array_handler=None,
+                 device_number=None) -> None:
         """
         Creates a data handling for single node simulations.

@@ -34,9 +31,17 @@ class SerialDataHandling(DataHandling):
             domain_size: size of the spatial domain as tuple
             default_ghost_layers: default number of ghost layers used, if not overridden in add_array() method
             default_layout: default layout used, if not overridden in add_array() method
+            periodicity: List of booleans that indicate which dimensions have periodic boundary conditions.
+                         Alternatively, a single boolean can be given, which is used for all dimensions. Defaults to
+                         False (non-periodic)
             default_target: `Target` either 'CPU' or 'GPU'. If set to 'GPU' for each array also a GPU version is
                             allocated if not overwritten in add_array, and synchronization functions are for the GPU by
                             default
+            array_handler: An object that provides the same interface as `GPUArrayHandler`, which is used for creation
+                           and transferring of GPU arrays. Default is to construct a fresh `GPUArrayHandler`
+            device_number: If `default_target` is set to 'GPU', a device number should be specified. If none is given,
+                           the device with the largest amount of memory is used. If multiple devices have the same
+                           amount of memory, the one with the lower number is used
         """
         super(SerialDataHandling, self).__init__()
         self._domainSize = tuple(domain_size)

@@ -48,17 +53,17 @@ class SerialDataHandling(DataHandling):
         self.custom_data_cpu = DotDict()
         self.custom_data_gpu = DotDict()
         self._custom_data_transfer_functions = {}
-        self._opencl_queue = opencl_queue
-        self._opencl_ctx = opencl_ctx

         if not array_handler:
             try:
-                self.array_handler = PyCudaArrayHandler()
-            except Exception:
-                self.array_handler = PyCudaNotAvailableHandler()
-
-            if default_target == Target.OPENCL or opencl_queue:
-                self.array_handler = PyOpenClArrayHandler(opencl_queue)
+                if device_number is None:
+                    import cupy.cuda.runtime
+                    if cupy.cuda.runtime.getDeviceCount() > 0:
+                        device_number = sorted(range(cupy.cuda.runtime.getDeviceCount()),
+                                               key=lambda i: cupy.cuda.Device(i).mem_info[1], reverse=True)[0]
+                self.array_handler = GPUArrayHandler(device_number)
+            except ImportError:
+                self.array_handler = GPUNotAvailableHandler()
         else:
             self.array_handler = array_handler

@@ -134,10 +139,14 @@ class SerialDataHandling(DataHandling):
         else:
             layout_tuple = spatial_layout_string_to_tuple(layout, self.dim)

-        # cpu_arr is always created - since there is no create_pycuda_array_with_layout()
+        # cpu_arr is always created - since there is no create_gpu_array_with_layout()
         byte_offset = ghost_layers * np.dtype(dtype).itemsize
-        cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
-                                                 byte_offset=byte_offset, **kwargs)
+
+        if gpu:
+            cpu_arr = self.array_handler.pinned_numpy_array(shape=kwargs['shape'], layout=layout_tuple, dtype=dtype)
+        else:
+            cpu_arr = create_numpy_array_with_layout(layout=layout_tuple, alignment=alignment,
+                                                     byte_offset=byte_offset, **kwargs)

         if alignment and gpu:
             raise NotImplementedError("Alignment for GPU fields not supported")

@@ -259,14 +268,16 @@ class SerialDataHandling(DataHandling):
             transfer_func = self._custom_data_transfer_functions[name][1]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.download(self.gpu_arrays[name], self.cpu_arrays[name])

     def to_gpu(self, name):
         if name in self._custom_data_transfer_functions:
             transfer_func = self._custom_data_transfer_functions[name][0]
             transfer_func(self.custom_data_gpu[name], self.custom_data_cpu[name])
         else:
-            self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])
+            if name in self.cpu_arrays.keys() & self.gpu_arrays.keys():
+                self.array_handler.upload(self.gpu_arrays[name], self.cpu_arrays[name])

     def is_on_gpu(self, name):
         return name in self.gpu_arrays

@@ -280,8 +291,6 @@ class SerialDataHandling(DataHandling):
     def synchronization_function(self, names, stencil=None, target=None, functor=None, **_):
         if target is None:
             target = self.default_target
-        if target == Target.OPENCL:  # TODO potential misuse between Target and Backend
-            target = Target.GPU
         assert target in (Target.CPU, Target.GPU)
         if not hasattr(names, '__len__') or type(names) is str:
             names = [names]

@@ -323,17 +332,14 @@ class SerialDataHandling(DataHandling):
                 result.append(functor(filtered_stencil, ghost_layers=gls))
         else:
             if functor is None:
-                from pystencils.gpucuda.periodicity import get_periodic_boundary_functor as functor
-                target = Target.GPU if not isinstance(self.array_handler,
-                                                      PyOpenClArrayHandler) else Target.OPENCL
+                from pystencils.gpu.periodicity import get_periodic_boundary_functor as functor
+                target = Target.GPU
             result.append(functor(filtered_stencil, self._domainSize,
                                   index_dimensions=self.fields[name].index_dimensions,
                                   index_dim_shape=values_per_cell,
                                   dtype=self.fields[name].dtype.numpy_dtype,
                                   ghost_layers=gls,
-                                  target=target,
-                                  opencl_queue=self._opencl_queue,
-                                  opencl_ctx=self._opencl_ctx))
+                                  target=target))

         if target == Target.CPU:
             def result_functor():

@@ -432,13 +438,19 @@ class SerialDataHandling(DataHandling):
     def world_rank(self):
         return 0

-    def save_all(self, file):
-        np.savez_compressed(file, **self.cpu_arrays)
+    def save_all(self, filename, compressed=True, synchronise_data=True):
+        if synchronise_data:
+            for name in (self.cpu_arrays.keys() & self.gpu_arrays.keys()):
+                self.to_cpu(name)
+        if compressed:
+            np.savez_compressed(filename, **self.cpu_arrays)
+        else:
+            np.savez(filename, **self.cpu_arrays)

-    def load_all(self, file):
-        if '.npz' not in file:
-            file += '.npz'
-        file_contents = np.load(file)
+    def load_all(self, filename, synchronise_data=True):
+        if '.npz' not in filename:
+            filename += '.npz'
+        file_contents = np.load(filename)
         for arr_name, arr_contents in self.cpu_arrays.items():
             if arr_name not in file_contents:
                 print(f"Skipping read data {arr_name} because there is no data with this name in data handling")

@@ -448,3 +460,6 @@ class SerialDataHandling(DataHandling):
                       f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
                 continue
             np.copyto(arr_contents, file_contents[arr_name])
+            if synchronise_data:
+                if arr_name in self.gpu_arrays.keys():
+                    self.to_gpu(arr_name)
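Taken together, the serial data handling now allocates pinned host memory for GPU fields, picks the CUDA device with the largest memory when none is given, and keeps host and device copies in sync on save/load. A minimal usage sketch (assuming cupy and a CUDA device are available; the array and file names are illustrative):

    import pystencils as ps
    from pystencils.datahandling.serial_datahandling import SerialDataHandling

    # GPU default target: every added array also gets a device copy,
    # and the host copy is allocated as pinned memory
    dh = SerialDataHandling(domain_size=(32, 32), default_target=ps.Target.GPU)
    dh.add_array('phi', values_per_cell=1)

    dh.cpu_arrays['phi'].fill(42.0)
    dh.all_to_gpu()

    # device arrays are synchronised back to the host before writing
    dh.save_all('checkpoint', compressed=True, synchronise_data=True)

    # after loading, arrays that also live on the GPU are uploaded again
    dh.load_all('checkpoint', synchronise_data=True)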
pystencils/display_utils.py

@@ -10,7 +10,12 @@ from pystencils.kernel_wrapper import KernelWrapper
 def to_dot(expr: sp.Expr, graph_style: Optional[Dict[str, Any]] = None, short=True):
     """Show a sympy or pystencils AST as dot graph"""
     from pystencils.astnodes import Node
-    import graphviz
+    try:
+        import graphviz
+    except ImportError:
+        print("graphviz is not installed. Visualizing the AST is not available")
+        return

     graph_style = {} if graph_style is None else graph_style

     if isinstance(expr, Node):

@@ -46,7 +51,7 @@ def get_code_obj(ast: Union[KernelFunction, KernelWrapper], custom_backend=None)
     if isinstance(ast, KernelWrapper):
         ast = ast.ast

-    if ast.backend not in {Backend.C, Backend.CUDA, Backend.OPENCL}:
+    if ast.backend not in {Backend.C, Backend.CUDA}:
         raise NotImplementedError(f'get_code_obj is not implemented for backend {ast.backend}')
     dialect = ast.backend
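With the guarded import above, to_dot now degrades gracefully when graphviz is missing instead of raising an ImportError. A small sketch (plain sympy expression, nothing else assumed):

    import sympy as sp
    from pystencils.display_utils import to_dot

    x, y = sp.symbols("x y")
    # returns a dot graph, or prints a notice and returns None if graphviz is not installed
    dot_graph = to_dot(x**2 + y)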
pystencils/enums.py

from enum import Enum, auto


class Target(Enum):
    """
    The Target enumeration represents all possible targets that can be used for the code generation.
    """
    CPU = auto()
    """
    Target CPU architecture.
    """
    GPU = auto()
    """
    Target GPU architecture.
    """


class Backend(Enum):
    """
    The Backend enumeration represents all possible backends that can be used for the code generation.
    Backends and targets must be combined with care. For example CPU as a target and CUDA as a backend makes no sense.
    """
    C = auto()
    """
    Use the C Backend of pystencils.
    """
    CUDA = auto()
    """
    Use the CUDA backend to generate code for NVIDIA GPUs.
    """
pystencils/fast_approximation.py

@@ -4,20 +4,30 @@ import sympy as sp
 from pystencils.astnodes import Node
 from pystencils.simp import AssignmentCollection
+from pystencils.assignment import Assignment


 # noinspection PyPep8Naming
 class fast_division(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (2,)


 # noinspection PyPep8Naming
 class fast_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (1, )


 # noinspection PyPep8Naming
 class fast_inv_sqrt(sp.Function):
+    """
+    Produces special float instructions for CUDA kernels
+    """
     nargs = (1, )

@@ -32,7 +42,7 @@ def _run(term, visitor):
     return visitor(term)


-def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
     def visit(expr):
         if isinstance(expr, Node):
             return expr

@@ -48,7 +58,7 @@ def insert_fast_sqrts(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection])
     return _run(term, visit)


-def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection]):
+def insert_fast_divisions(term: Union[sp.Expr, List[sp.Expr], AssignmentCollection, Assignment]):
     def visit(expr):
         if isinstance(expr, Node):
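The widened type hints mean a single Assignment can now be passed directly instead of a whole collection. A short sketch (field names are illustrative):

    import sympy as sp
    import pystencils as ps
    from pystencils.fast_approximation import insert_fast_divisions, insert_fast_sqrts

    f, g = ps.fields("f, g: float32[2D]")
    asm = ps.Assignment(g.center, sp.sqrt(f.center) / (f[1, 0] + f[-1, 0]))

    # sqrt and division are replaced by fast_sqrt / fast_division nodes,
    # which the CUDA backend emits as fast-math instructions
    fast_asm = insert_fast_sqrts(insert_fast_divisions(asm))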
pystencils/fd/finitedifferences.py

@@ -105,9 +105,8 @@ class Discretization2ndOrder:
             return self._discretize_advection(e)
         elif isinstance(e, Diff):
             arg, *indices = diff_args(e)
-            from pystencils.interpolation_astnodes import InterpolatorAccess
-            if not isinstance(arg, (Field.Access, InterpolatorAccess)):
+            if not isinstance(arg, Field.Access):
                 raise ValueError("Only derivatives with field or field accesses as arguments can be discretized")
             return self.spatial_stencil(indices, self.dx, arg)
         else:
pystencils/fd/spatial.py

+from functools import lru_cache
 from typing import Tuple

 import sympy as sp

 from pystencils.astnodes import LoopOverCoordinate
-from pystencils.cache import memorycache
 from pystencils.fd import Diff
 from pystencils.field import Field
 from pystencils.transformations import generic_visit

@@ -136,7 +136,7 @@ def discretize_spatial_staggered(expr, dx, stencil=fd_stencils_standard):
 # -------------------------------------- special stencils --------------------------------------------------------------

-@memorycache(maxsize=1)
+@lru_cache(maxsize=1)
 def forth_order_2d_derivation() -> Tuple[FiniteDifferenceStencilDerivation.Result, ...]:
     # Symmetry, isotropy and 4th order conditions are not enough to fully specify the stencil
     # one weight has to be specifically set to a somewhat arbitrary value
pystencils/field.py

@@ -5,7 +5,7 @@ import pickle
 import re
 from enum import Enum
 from itertools import chain
-from typing import List, Optional, Sequence, Set, Tuple
+from typing import List, Optional, Sequence, Set, Tuple, Union

 import numpy as np
 import sympy as sp

@@ -13,13 +13,13 @@ from sympy.core.cache import cacheit
 import pystencils
 from pystencils.alignedarray import aligned_empty
-from pystencils.data_types import StructType, TypedSymbol, create_type
-from pystencils.kernelparameters import FieldShapeSymbol, FieldStrideSymbol
+from pystencils.typing import StructType, TypedSymbol, BasicType, create_type
+from pystencils.typing.typed_sympy import FieldShapeSymbol, FieldStrideSymbol
 from pystencils.stencil import (
     direction_string_to_offset, inverse_direction, offset_to_direction_string)
 from pystencils.sympyextensions import is_integer_sequence

-__all__ = ['Field', 'fields', 'FieldType', 'AbstractField']
+__all__ = ['Field', 'fields', 'FieldType', 'Field']


 class FieldType(Enum):

@@ -69,81 +69,7 @@ class FieldType(Enum):
         return field.field_type == FieldType.STAGGERED_FLUX


-def fields(description=None, index_dimensions=0, layout=None, field_type=FieldType.GENERIC, **kwargs):
-    """Creates pystencils fields from a string description.
-
-    Examples:
-        Create a 2D scalar and vector field:
-            >>> s, v = fields("s, v(2): double[2D]")
-            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
-            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
-
-        Create an integer field of shape (10, 20):
-            >>> f = fields("f : int32[10, 20]")
-            >>> f.has_fixed_shape, f.shape
-            (True, (10, 20))
-
-        Numpy arrays can be used as template for shape and data type of field:
-            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
-            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
-            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
-            >>> assert v.index_shape == (2,)
-
-        Format string can be left out, field names are taken from keyword arguments.
-            >>> fields(f1=arr_s, f2=arr_s)
-            [f1: double[20,20], f2: double[20,20]]
-
-        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
-            >>> f = fields(f=arr_v, index_dimensions=1)
-            >>> assert f.index_dimensions == 1
-            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
-            >>> f.layout
-            (2, 1, 0)
-    """
-    result = []
-    if description:
-        field_descriptions, dtype, shape = _parse_description(description)
-        layout = 'numpy' if layout is None else layout
-        for field_name, idx_shape in field_descriptions:
-            if field_name in kwargs:
-                arr = kwargs[field_name]
-                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
-                assert idx_shape_of_arr == idx_shape
-                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
-                                                  field_type=field_type)
-            elif isinstance(shape, tuple):
-                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
-                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
-            elif isinstance(shape, int):
-                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            elif shape is None:
-                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
-                                         index_shape=idx_shape, layout=layout, field_type=field_type)
-            else:
-                assert False
-            result.append(f)
-    else:
-        assert layout is None, "Layout can not be specified when creating Field from numpy array"
-        for field_name, arr in kwargs.items():
-            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
-                                                        field_type=field_type))
-
-    if len(result) == 0:
-        return None
-    elif len(result) == 1:
-        return result[0]
-    else:
-        return result
-
-
-class AbstractField:
-
-    class AbstractAccess:
-        pass
-
-
-class Field(AbstractField):
+class Field:
     """
     With fields one can formulate stencil-like update rules on structured grids.
     This Field class knows about the dimension, memory layout (strides) and optionally about the size of an array.

@@ -330,9 +256,7 @@ class Field(AbstractField):
         self.shape = shape
         self.strides = strides
         self.latex_name: Optional[str] = None
-        self.coordinate_origin: tuple[float, sp.Symbol] = sp.Matrix(tuple(
-            0 for _ in range(self.spatial_dimensions)
-        ))
+        self.coordinate_origin = sp.Matrix([0] * self.spatial_dimensions)
         self.coordinate_transform = sp.eye(self.spatial_dimensions)
         if field_type == FieldType.STAGGERED:
             assert self.staggered_stencil

@@ -341,8 +265,7 @@ class Field(AbstractField):
         if self.has_fixed_shape:
             return Field(new_name, self.field_type, self._dtype, self._layout, self.shape, self.strides)
         else:
-            return Field.create_generic(new_name, self.spatial_dimensions, self.dtype.numpy_dtype,
-                                        self.index_dimensions, self._layout, self.index_shape, self.field_type)
+            return Field(new_name, self.field_type, self.dtype, self.layout, self.shape, self.strides)

     @property
     def spatial_dimensions(self) -> int:

@@ -432,8 +355,8 @@ class Field(AbstractField):
         elif len(index_shape) == 2:
             return sp.Matrix([[self(i, j) for j in range(index_shape[1])] for i in range(index_shape[0])])
         elif len(index_shape) == 3:
-            return sp.Matrix([[[self(i, j, k) for k in range(index_shape[2])]
-                               for j in range(index_shape[1])] for i in range(index_shape[0])])
+            return sp.Array([[[self(i, j, k) for k in range(index_shape[2])]
+                              for j in range(index_shape[1])] for i in range(index_shape[0])])
         else:
             raise NotImplementedError("center_vector is not implemented for more than 3 index dimensions")

@@ -466,35 +389,13 @@ class Field(AbstractField):
         if type(offset) is not tuple:
             offset = (offset,)
         if len(offset) != self.spatial_dimensions:
-            raise ValueError("Wrong number of spatial indices: "
-                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")
         return Field.Access(self, offset)

     def absolute_access(self, offset, index):
         assert FieldType.is_custom(self)
         return Field.Access(self, offset, index, is_absolute_access=True)

-    def interpolated_access(self,
-                            offset: Tuple,
-                            interpolation_mode='linear',
-                            address_mode='BORDER',
-                            allow_textures=True):
-        """Provides access to field values at non-integer positions
-
-        ``interpolated_access`` is similar to :func:`Field.absolute_access` except that
-        it allows non-integer offsets and automatic handling of out-of-bound accesses.
-
-        :param offset:              Tuple of spatial coordinates (can be floats)
-        :param interpolation_mode:  One of :class:`pystencils.interpolation_astnodes.InterpolationMode`
-        :param address_mode:        How boundaries are handled can be 'border', 'wrap', 'mirror', 'clamp'
-        :param allow_textures:      Allow implementation by texture accesses on GPUs
-        """
-        from pystencils.interpolation_astnodes import Interpolator
-        return Interpolator(self,
-                            interpolation_mode,
-                            address_mode,
-                            allow_textures=allow_textures).at(offset)
-
     def staggered_access(self, offset, index=None):
         """If this field is a staggered field, it can be accessed using half-integer offsets.
         For example, an offset of ``(0, sp.Rational(1,2))`` or ``"E"`` corresponds to the staggered point to the east

@@ -511,8 +412,7 @@ class Field(AbstractField):
             offset = tuple(direction_string_to_offset(offset, self.spatial_dimensions))
             offset = tuple([o * sp.Rational(1, 2) for o in offset])
         if len(offset) != self.spatial_dimensions:
-            raise ValueError("Wrong number of spatial indices: "
-                             "Got %d, expected %d" % (len(offset), self.spatial_dimensions))
+            raise ValueError(f"Wrong number of spatial indices: Got {len(offset)}, expected {self.spatial_dimensions}")

         prefactor = 1
         neighbor_vec = [0] * len(offset)

@@ -526,8 +426,7 @@ class Field(AbstractField):
             if FieldType.is_staggered_flux(self):
                 prefactor = -1
         if neighbor not in self.staggered_stencil:
-            raise ValueError("{} is not a valid neighbor for the {} stencil".format(offset_orig,
-                                                                                    self.staggered_stencil_name))
+            raise ValueError(f"{offset_orig} is not a valid neighbor for the {self.staggered_stencil_name} stencil")

         offset = tuple(sp.Matrix(offset) - sp.Rational(1, 2) * sp.Matrix(neighbor_vec))

@@ -539,15 +438,13 @@ class Field(AbstractField):
             return prefactor * Field.Access(self, offset, (idx,))
         else:  # this field stores a vector or tensor at each staggered position
             if index is None:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (0, self.index_dimensions - 1))
+                raise ValueError(f"Wrong number of indices: Got 0, expected {self.index_dimensions - 1}")
             if type(index) is np.ndarray:
                 index = tuple(index)
             if type(index) is not tuple:
                 index = (index,)
             if self.index_dimensions != len(index) + 1:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (len(index), self.index_dimensions - 1))
+                raise ValueError(f"Wrong number of indices: Got {len(index)}, expected {self.index_dimensions - 1}")
             return prefactor * Field.Access(self, offset, (idx, *index))

@@ -587,7 +484,7 @@ class Field(AbstractField):
     @property
     def staggered_stencil_name(self):
         assert FieldType.is_staggered(self)
-        return "D%dQ%d" % (self.spatial_dimensions, self.index_shape[0] * 2 + 1)
+        return f"D{self.spatial_dimensions}Q{self.index_shape[0] * 2 + 1}"

     def __call__(self, *args, **kwargs):
         center = tuple([0] * self.spatial_dimensions)

@@ -651,7 +548,7 @@ class Field(AbstractField):
         self.coordinate_origin = -sp.Matrix([i / 2 for i in self.spatial_shape])

     # noinspection PyAttributeOutsideInit,PyUnresolvedReferences
-    class Access(TypedSymbol, AbstractField.AbstractAccess):
+    class Access(TypedSymbol):
         """Class representing a relative access into a `Field`.

         This class behaves like a normal sympy Symbol, it is actually derived from it. One can built up

@@ -705,7 +602,11 @@ class Field(AbstractField):
             if superscript is not None:
                 symbol_name += "^" + superscript

-            obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+            if dtype:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, dtype)
+            else:
+                obj = super(Field.Access, self).__xnew__(self, symbol_name, field.dtype)
+
             obj._field = field
             obj._offsets = []
             for o in offsets:

@@ -747,9 +648,14 @@ class Field(AbstractField):
                 idx = ()

             if len(idx) != self.field.index_dimensions:
-                raise ValueError("Wrong number of indices: "
-                                 "Got %d, expected %d" % (len(idx), self.field.index_dimensions))
-            return Field.Access(self.field, self._offsets, idx, dtype=self.dtype)
+                raise ValueError(f"Wrong number of indices: Got {len(idx)}, expected {self.field.index_dimensions}")
+            if len(idx) == 1 and isinstance(idx[0], str):
+                dtype = BasicType(self.field.dtype.numpy_dtype[idx[0]])
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=dtype)
+            else:
+                return Field.Access(self.field, self._offsets, idx,
+                                    is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def __getitem__(self, *idx):
             return self.__call__(*idx)

@@ -803,7 +709,8 @@ class Field(AbstractField):
             """
             offset_list = list(self.offsets)
             offset_list[coord_id] += offset
-            return Field.Access(self.field, tuple(offset_list), self.index, dtype=self.dtype)
+            return Field.Access(self.field, tuple(offset_list), self.index,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def get_shifted(self, *shift) -> 'Field.Access':
             """Returns a new Access with changed spatial coordinates

@@ -816,6 +723,7 @@ class Field(AbstractField):
             return Field.Access(self.field,
                                 tuple(a + b for a, b in zip(shift, self.offsets)),
                                 self.index,
+                                is_absolute_access=self.is_absolute_access,
                                 dtype=self.dtype)

         def at_index(self, *idx_tuple) -> 'Field.Access':

@@ -826,12 +734,14 @@ class Field(AbstractField):
                 >>> f(0).at_index(8)
                 f_C^8
             """
-            return Field.Access(self.field, self.offsets, idx_tuple, dtype=self.dtype)
+            return Field.Access(self.field, self.offsets, idx_tuple,
+                                is_absolute_access=self.is_absolute_access, dtype=self.dtype)

         def _eval_subs(self, old, new):
             return Field.Access(self.field,
                                 tuple(sp.sympify(a).subs(old, new) for a in self.offsets),
                                 tuple(sp.sympify(a).subs(old, new) for a in self.index),
+                                is_absolute_access=self.is_absolute_access,
                                 dtype=self.dtype)

         @property

@@ -849,7 +759,8 @@ class Field(AbstractField):
         def _hashable_content(self):
             super_class_contents = super(Field.Access, self)._hashable_content()
-            return (super_class_contents, self._field.hashable_contents(), *self._index, *self._offsets)
+            return (super_class_contents, self._field.hashable_contents(), *self._index,
+                    *self._offsets, self._is_absolute_access)

         def _staggered_offset(self, offsets, index):
             assert FieldType.is_staggered(self._field)

@@ -870,15 +781,14 @@ class Field(AbstractField):
             if FieldType.is_staggered(self._field):
                 if self.index and self.field.index_dimensions > 1:
-                    return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index[1:]
-                                                 if len(self.index) > 2 else self.index[1])
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index[1:] if len(self.index) > 2 else self.index[1]}}}"
                 else:
-                    return "{{%s}_{%s}}" % (n, offset_str)
+                    return f"{{{n}}}_{{{offset_str}}}"
             else:
                 if self.index and self.field.index_dimensions > 0:
-                    return "{{%s}_{%s}^{%s}}" % (n, offset_str, self.index if len(self.index) > 1 else self.index[0])
+                    return f"{{{n}}}_{{{offset_str}}}^{{{self.index if len(self.index) > 1 else self.index[0]}}}"
                 else:
-                    return "{{%s}_{%s}}" % (n, offset_str)
+                    return f"{{{n}}}_{{{offset_str}}}"

         def __str__(self):
             n = self._field.latex_name if self._field.latex_name else self._field.name

@@ -901,6 +811,75 @@ class Field(AbstractField):
             return f"{n}[{offset_str}]"


+def fields(description=None, index_dimensions=0, layout=None,
+           field_type=FieldType.GENERIC, **kwargs) -> Union[Field, List[Field]]:
+    """Creates pystencils fields from a string description.
+
+    Examples:
+        Create a 2D scalar and vector field:
+            >>> s, v = fields("s, v(2): double[2D]")
+            >>> assert s.spatial_dimensions == 2 and s.index_dimensions == 0
+            >>> assert (v.spatial_dimensions, v.index_dimensions, v.index_shape) == (2, 1, (2,))
+
+        Create an integer field of shape (10, 20):
+            >>> f = fields("f : int32[10, 20]")
+            >>> f.has_fixed_shape, f.shape
+            (True, (10, 20))
+
+        Numpy arrays can be used as template for shape and data type of field:
+            >>> arr_s, arr_v = np.zeros([20, 20]), np.zeros([20, 20, 2])
+            >>> s, v = fields("s, v(2)", s=arr_s, v=arr_v)
+            >>> assert s.index_dimensions == 0 and s.dtype.numpy_dtype == arr_s.dtype
+            >>> assert v.index_shape == (2,)
+
+        Format string can be left out, field names are taken from keyword arguments.
+            >>> fields(f1=arr_s, f2=arr_s)
+            [f1: double[20,20], f2: double[20,20]]
+
+        The keyword names ``index_dimension`` and ``layout`` have special meaning, don't use them for field names
+            >>> f = fields(f=arr_v, index_dimensions=1)
+            >>> assert f.index_dimensions == 1
+            >>> f = fields("pdfs(19) : float32[3D]", layout='fzyx')
+            >>> f.layout
+            (2, 1, 0)
+    """
+    result = []
+    if description:
+        field_descriptions, dtype, shape = _parse_description(description)
+        layout = 'numpy' if layout is None else layout
+        for field_name, idx_shape in field_descriptions:
+            if field_name in kwargs:
+                arr = kwargs[field_name]
+                idx_shape_of_arr = () if not len(idx_shape) else arr.shape[-len(idx_shape):]
+                assert idx_shape_of_arr == idx_shape
+                f = Field.create_from_numpy_array(field_name, kwargs[field_name], index_dimensions=len(idx_shape),
+                                                  field_type=field_type)
+            elif isinstance(shape, tuple):
+                f = Field.create_fixed_size(field_name, shape + idx_shape, dtype=dtype,
+                                            index_dimensions=len(idx_shape), layout=layout, field_type=field_type)
+            elif isinstance(shape, int):
+                f = Field.create_generic(field_name, spatial_dimensions=shape, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            elif shape is None:
+                f = Field.create_generic(field_name, spatial_dimensions=2, dtype=dtype,
+                                         index_shape=idx_shape, layout=layout, field_type=field_type)
+            else:
+                assert False
+            result.append(f)
+    else:
+        assert layout is None, "Layout can not be specified when creating Field from numpy array"
+        for field_name, arr in kwargs.items():
+            result.append(Field.create_from_numpy_array(field_name, arr, index_dimensions=index_dimensions,
+                                                        field_type=field_type))
+
+    if len(result) == 0:
+        raise ValueError("Could not parse field description")
+    elif len(result) == 1:
+        return result[0]
+    else:
+        return result


 def get_layout_from_strides(strides: Sequence[int], index_dimension_ids: Optional[List[int]] = None):
     index_dimension_ids = [] if index_dimension_ids is None else index_dimension_ids
     coordinates = list(range(len(strides)))

@@ -969,24 +948,35 @@ def create_numpy_array_with_layout(shape, layout, alignment=False, byte_offset=0
 def spatial_layout_string_to_tuple(layout_str: str, dim: int) -> Tuple[int, ...]:
-    if layout_str in ('fzyx', 'zyxf'):
-        assert dim <= 3
-        return tuple(reversed(range(dim)))
-    if layout_str in ('fzyx', 'f', 'reverse_numpy', 'SoA'):
-        return tuple(reversed(range(dim)))
-    elif layout_str in ('c', 'numpy', 'AoS'):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+
+    layout_str = layout_str.lower()
+
+    if layout_str in ('fzyx', 'zyxf', 'soa', 'aos'):
+        if dim > 3:
+            raise ValueError(f"Invalid spatial dimensionality for layout descriptor {layout_str}: May be at most 3.")
+        return tuple(reversed(range(dim)))
+
+    if layout_str in ('f', 'reverse_numpy'):
+        return tuple(reversed(range(dim)))
+    elif layout_str in ('c', 'numpy'):
         return tuple(range(dim))
     raise ValueError("Unknown layout descriptor " + layout_str)


 def layout_string_to_tuple(layout_str, dim):
+    if dim <= 0:
+        raise ValueError("Dimensionality must be positive")
+
     layout_str = layout_str.lower()
     if layout_str == 'fzyx' or layout_str == 'soa':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
         return tuple(reversed(range(dim)))
     elif layout_str == 'zyxf' or layout_str == 'aos':
-        assert dim <= 4
+        if dim > 4:
+            raise ValueError(f"Invalid total dimensionality for layout descriptor {layout_str}: May be at most 4.")
        return tuple(reversed(range(dim - 1))) + (dim - 1,)
     elif layout_str == 'f' or layout_str == 'reverse_numpy':
         return tuple(reversed(range(dim)))

@@ -1060,7 +1050,6 @@ def _parse_part1(d):
 def _parse_description(description):
     def parse_part2(d):
         result = type_description_regex.match(d)
-
         if result:
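One behavioural point worth noting in these hunks: every derived access (get_shifted, at_index, neighbor shifts, _eval_subs) now threads is_absolute_access and the dtype through to the new Access, so absolute accesses no longer silently turn into relative ones. A small sketch of the helpers involved (illustrative field):

    import pystencils as ps

    v = ps.fields("v(3): double[2D]")

    center = v(1)                    # index component 1 at the cell centre: v_C^1
    east = center.get_shifted(1, 0)  # same index component, shifted one cell in x
    other = center.at_index(2)       # same offset, different index component

    assert east.offsets == (1, 0)
    assert other.index == (2,)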
pystencils/functions.py

import sympy as sp
from pystencils.typing import PointerType


class DivFunc(sp.Function):
    """
    DivFunc represents a division operation, since sympy represents divisions with ^-1
    """
    is_Atom = True
    is_real = True

    def __new__(cls, *args, **kwargs):
        if len(args) != 2:
            raise ValueError(f'{cls} takes only 2 arguments, instead {len(args)} received!')
        divisor, dividend, *other_args = args

        return sp.Function.__new__(cls, divisor, dividend, *other_args, **kwargs)

    def _eval_evalf(self, *args, **kwargs):
        return self.divisor.evalf() / self.dividend.evalf()

    @property
    def divisor(self):
        return self.args[0]

    @property
    def dividend(self):
        return self.args[1]


class AddressOf(sp.Function):
    """
    AddressOf is the '&' operation in C. It gets the address of a lvalue.
    """
    is_Atom = True

    def __new__(cls, arg):
        obj = sp.Function.__new__(cls, arg)
        return obj

    @property
    def canonical(self):
        if hasattr(self.args[0], 'canonical'):
            return self.args[0].canonical
        else:
            raise NotImplementedError()

    @property
    def is_commutative(self):
        return self.args[0].is_commutative

    @property
    def dtype(self):
        if hasattr(self.args[0], 'dtype'):
            return PointerType(self.args[0].dtype, restrict=True)
        else:
            raise ValueError(f'pystencils supports only non void pointers. Current address_of type: {self.args[0]}')
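A short sketch of how these two nodes behave on typed expressions (module path inferred from the imports above; the field name is illustrative):

    import pystencils as ps
    from pystencils.functions import AddressOf, DivFunc

    f = ps.fields("f: double[2D]")

    addr = AddressOf(f.center)         # prints as '&...' in generated C code
    print(addr.dtype)                  # restrict-qualified pointer to double

    div = DivFunc(f[1, 0], f.center)   # explicit division node: args[0] / args[1]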
pystencils/gpu/__init__.py (renamed from pystencils/gpucuda/__init__.py)

-from pystencils.gpucuda.cudajit import make_python_function
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
+from pystencils.gpu.gpujit import make_python_function
+from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel

 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing

-__all__ = ['create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
+__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
+           'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
            'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
pystencils/gpu/gpu_array_handler.py (new file)

try:
    import cupy as cp
    import cupyx as cpx
except ImportError:
    cp = None
    cpx = None

import numpy as np


class GPUArrayHandler:
    def __init__(self, device_number):
        self._device_number = device_number

    def zeros(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.zeros(shape=shape, dtype=dtype, order=order)

    def ones(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.ones(shape=shape, dtype=dtype, order=order)

    def empty(self, shape, dtype=np.float64, order='C'):
        with cp.cuda.Device(self._device_number):
            return cp.empty(shape=shape, dtype=dtype, order=order)

    def to_gpu(self, numpy_array):
        swaps = _get_index_swaps(numpy_array)
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                gpu_array = cp.asarray(numpy_array.base)
            for a, b in reversed(swaps):
                gpu_array = gpu_array.swapaxes(a, b)
            return gpu_array
        else:
            return cp.asarray(numpy_array)

    def upload(self, array, numpy_array):
        assert self._device_number == array.device.id
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                array.base.set(numpy_array.base)
        else:
            with cp.cuda.Device(self._device_number):
                array.set(numpy_array)

    def download(self, array, numpy_array):
        assert self._device_number == array.device.id
        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
            with cp.cuda.Device(self._device_number):
                numpy_array.base[:] = array.base.get()
        else:
            with cp.cuda.Device(self._device_number):
                numpy_array[:] = array.get()

    def randn(self, shape, dtype=np.float64):
        with cp.cuda.Device(self._device_number):
            return cp.random.randn(*shape, dtype=dtype)

    @staticmethod
    def pinned_numpy_array(layout, shape, dtype):
        assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
        cur_layout = list(range(len(shape)))
        swaps = []
        for i in range(len(layout)):
            if cur_layout[i] != layout[i]:
                index_to_swap_with = cur_layout.index(layout[i])
                swaps.append((i, index_to_swap_with))
                cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
        assert tuple(cur_layout) == tuple(layout)

        shape = list(shape)
        for a, b in swaps:
            shape[a], shape[b] = shape[b], shape[a]

        res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)

        for a, b in reversed(swaps):
            res = res.swapaxes(a, b)
        return res

    from_numpy = to_gpu


class GPUNotAvailableHandler:
    def __getattribute__(self, name):
        raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")


def _get_index_swaps(array):
    swaps = []
    if array.base is not None and isinstance(array.base, np.ndarray):
        for stride in array.base.strides:
            index_base = array.base.strides.index(stride)
            index_view = array.strides.index(stride)
            if index_base != index_view and (index_view, index_base) not in swaps:
                swaps.append((index_base, index_view))
    return swaps
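A usage sketch for the handler (assuming cupy is installed and device 0 exists; plain C layout is used so no axis swaps are involved):

    import numpy as np
    from pystencils.gpu.gpu_array_handler import GPUArrayHandler

    handler = GPUArrayHandler(device_number=0)

    gpu_arr = handler.zeros((16, 16), dtype=np.float64)

    # page-locked host buffer, which makes host/device copies faster
    host_arr = handler.pinned_numpy_array(layout=(0, 1), shape=(16, 16), dtype=np.float64)
    host_arr[:] = 1.0

    handler.upload(gpu_arr, host_arr)    # host -> device
    handler.download(gpu_arr, host_arr)  # device -> host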
...@@ -2,13 +2,11 @@ import numpy as np ...@@ -2,13 +2,11 @@ import numpy as np
from pystencils.backends.cbackend import get_headers from pystencils.backends.cbackend import get_headers
from pystencils.backends.cuda_backend import generate_cuda from pystencils.backends.cuda_backend import generate_cuda
from pystencils.data_types import StructType from pystencils.typing import StructType
from pystencils.field import FieldType from pystencils.field import FieldType
from pystencils.gpucuda.texture_utils import ndarray_to_tex from pystencils.include import get_pystencils_include_path
from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
from pystencils.interpolation_astnodes import InterpolatorAccess, TextureCachedField
from pystencils.kernel_wrapper import KernelWrapper from pystencils.kernel_wrapper import KernelWrapper
from pystencils.kernelparameters import FieldPointerSymbol from pystencils.typing import BasicType, FieldPointerSymbol
USE_FAST_MATH = True USE_FAST_MATH = True
...@@ -23,57 +21,49 @@ def get_cubic_interpolation_include_paths(): ...@@ -23,57 +21,49 @@ def get_cubic_interpolation_include_paths():
def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None): def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
""" """
Creates a kernel function from an abstract syntax tree which Creates a kernel function from an abstract syntax tree which
was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel` was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
or :func:`pystencils.gpucuda.created_indexed_cuda_kernel` or :func:`pystencils.gpu.created_indexed_cuda_kernel`
Args: Args:
kernel_function_node: the abstract syntax tree kernel_function_node: the abstract syntax tree
argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
returned kernel functor. returned kernel functor.
custom_backend: use own custom printer for code generation
Returns: Returns:
compiled kernel as Python function compiled kernel as Python function
""" """
import pycuda.autoinit # NOQA import cupy as cp
from pycuda.compiler import SourceModule
if argument_dict is None: if argument_dict is None:
argument_dict = {} argument_dict = {}
header_list = ['<cstdint>'] + list(get_headers(kernel_function_node)) headers = get_headers(kernel_function_node)
includes = "\n".join(["#include %s" % (include_file,) for include_file in header_list]) if cp.cuda.runtime.is_hip:
headers.add('"gpu_defines.h"')
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<hip/hip_fp16.h>')
else:
headers.update({'"gpu_defines.h"', '<cstdint>'})
for field in kernel_function_node.fields_accessed:
if isinstance(field.dtype, BasicType) and field.dtype.is_half():
headers.add('<cuda_fp16.h>')
header_list = sorted(headers)
includes = "\n".join([f"#include {include_file}" for include_file in header_list])
code = includes + "\n" code = includes + "\n"
code += "#define FUNC_PREFIX __global__\n" code += "#define FUNC_PREFIX __global__\n"
code += "#define RESTRICT __restrict__\n\n" code += "#define RESTRICT __restrict__\n\n"
code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend)) code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
textures = set(d.interpolator for d in kernel_function_node.atoms(
InterpolatorAccess) if isinstance(d.interpolator, TextureCachedField))
nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"] options = ["-w", "-std=c++11"]
if USE_FAST_MATH: if USE_FAST_MATH:
nvcc_options.append("-use_fast_math") options.append("-use_fast_math")
options.append("-I" + get_pystencils_include_path())
# Code for CubicInterpolationCUDA
from pystencils.interpolation_astnodes import InterpolationMode
from os.path import join, dirname, isdir
if any(t.interpolation_mode == InterpolationMode.CUBIC_SPLINE for t in textures):
assert isdir(join(dirname(__file__), ("CubicInterpolationCUDA", "code")),
"Submodule CubicInterpolationCUDA does not exist.\n"
+ "Clone https://github.com/theHamsta/CubicInterpolationCUDA into pystencils.gpucuda")
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code")]
nvcc_options += ["-I" + join(dirname(__file__), "CubicInterpolationCUDA", "code", "internal")]
needed_dims = set(t.field.spatial_dimensions for t in textures
if t.interpolation_mode == InterpolationMode.CUBIC_SPLINE)
for i in needed_dims:
code = 'extern "C++" {\n#include "cubicTex%iD.cu"\n}\n' % i + code
mod = SourceModule(code, options=nvcc_options, include_dirs=[
get_pystencils_include_path(), get_pycuda_include_path()])
func = mod.get_function(kernel_function_node.function_name)
func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
parameters = kernel_function_node.get_parameters() parameters = kernel_function_node.get_parameters()
cache = {} cache = {}
...@@ -84,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen ...@@ -84,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
for k, v in kwargs.items())) for k, v in kwargs.items()))
try: try:
args, block_and_thread_numbers = cache[key] args, block_and_thread_numbers = cache[key]
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
except KeyError: except KeyError:
full_arguments = argument_dict.copy() full_arguments = argument_dict.copy()
full_arguments.update(kwargs) full_arguments.update(kwargs)
@@ -95,17 +88,16 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block']) block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid']) block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
# TODO: use texture objects: args = tuple(_build_numpy_argument_list(parameters, full_arguments))
# https://devblogs.nvidia.com/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/
for tex in textures:
tex_ref = mod.get_texref(str(tex))
ndarray_to_tex(tex_ref, full_arguments[tex.field.name], tex.address_mode,
tex.filter_mode, tex.use_normalized_coordinates, tex.read_as_integer)
args = _build_numpy_argument_list(parameters, full_arguments)
cache[key] = (args, block_and_thread_numbers) cache[key] = (args, block_and_thread_numbers)
cache_values.append(kwargs) # keep objects alive such that ids remain unique cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args, **block_and_thread_numbers) device = set(a.device.id for a in args if type(a) is cp.ndarray)
# import pycuda.driver as cuda assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
with cp.cuda.Device(device.pop()):
func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
# useful for debugging:
# cp.cuda.runtime.deviceSynchronize()
# cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
ast = kernel_function_node ast = kernel_function_node
parameters = kernel_function_node.get_parameters() parameters = kernel_function_node.get_parameters()
@@ -124,8 +116,8 @@ def _build_numpy_argument_list(parameters, argument_dict):
actual_type = array.dtype actual_type = array.dtype
expected_type = param.fields[0].dtype.numpy_dtype expected_type = param.fields[0].dtype.numpy_dtype
if expected_type != actual_type: if expected_type != actual_type:
raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." % raise ValueError(f"Data type mismatch for field {param.field_name}. "
(param.field_name, expected_type, actual_type)) f"Expected {expected_type} got {actual_type}.")
result.append(array) result.append(array)
elif param.is_field_stride: elif param.is_field_stride:
cast_to_dtype = param.symbol.dtype.numpy_dtype.type cast_to_dtype = param.symbol.dtype.numpy_dtype.type
@@ -160,22 +152,22 @@ def _check_arguments(parameter_specification, argument_dict):
try: try:
field_arr = argument_dict[symbolic_field.name] field_arr = argument_dict[symbolic_field.name]
except KeyError: except KeyError:
raise KeyError("Missing field parameter for kernel call " + str(symbolic_field)) raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape) symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1] symbolic_field_shape = symbolic_field_shape[:-1]
if symbolic_field_shape != field_arr.shape: if symbolic_field_shape != field_arr.shape:
raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" % raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
(symbolic_field.name, str(field_arr.shape), str(symbolic_field.shape))) f"which does not match expected shape {str(symbolic_field_shape)}")
if symbolic_field.has_fixed_shape: if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides) symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType): if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1] symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides: if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" % raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
(symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides))) f"which does not match expected strides {str(symbolic_field_strides)}")
if FieldType.is_indexed(symbolic_field): if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
@@ -183,9 +175,9 @@ def _check_arguments(parameter_specification, argument_dict):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions]) array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
if len(array_shapes) > 1: if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 1: if len(index_arr_shapes) > 1:
raise ValueError("All passed index arrays have to have the same size " + str(array_shapes)) raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
if len(index_arr_shapes) > 0: if len(index_arr_shapes) > 0:
return list(index_arr_shapes)[0] return list(index_arr_shapes)[0]
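# End-to-end usage sketch of the compiled GPU kernel functor (editorial note,
# not part of the original source; assumes the public pystencils API of this
# changeset, field names are illustrative):
import cupy as cp
import pystencils as ps

f, g = ps.fields("f, g: double[2D]")
config = ps.CreateKernelConfig(target=ps.Target.GPU)
kernel = ps.create_kernel(ps.Assignment(g.center, 2 * f.center), config=config).compile()
f_arr, g_arr = cp.ones((16, 16)), cp.zeros((16, 16))
kernel(f=f_arr, g=g_arr)  # all arrays must be allocated on the same device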
...
import abc import abc
from functools import partial from functools import partial
import math
from typing import List, Tuple
import sympy as sp import sympy as sp
from sympy.core.cache import cacheit from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.data_types import TypedSymbol, create_type from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor from pystencils.integer_functions import div_ceil, div_floor
from pystencils.slicing import normalize_slice
from pystencils.sympyextensions import is_integer_sequence, prod from pystencils.sympyextensions import is_integer_sequence, prod
@@ -32,23 +33,58 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c
class AbstractIndexing(abc.ABC): class AbstractIndexing(abc.ABC):
""" """
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with and computes the number of blocks and threads a kernel is started with.
a pystencils field, a slice to iterate over, and further optional parameters that must have default values. The Indexing class is created with an iteration space that is given as a list of slices to determine start, stop
and the step size for each coordinate. Furthermore, the data_layout is given as a tuple to determine the fast and slow
coordinates. This is important for an optimal mapping of coordinates to GPU threads.
""" """
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"not tuple of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property @property
@abc.abstractmethod @abc.abstractmethod
def coordinates(self): def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices. """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """ These symbolic indices can be obtained with the method `index_variables` """
@property @property
def index_variables(self): def index_variables(self):
"""Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """ """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod @abc.abstractmethod
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call. """Determine grid and block size for kernel call.
@@ -88,57 +124,79 @@ class AbstractIndexing(abc.ABC):
class BlockIndexing(AbstractIndexing): class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks. """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args: Args:
field: pystencils field (common to all Indexing classes) iteration_space: list of slices to determine start, stop and the step size for each coordinate
iteration_slice: slice that defines rectangular subarea which is iterated over data_layout: tuple specifying loop order with innermost loop last.
This is the same format as returned by `Field.layout`.
permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
gets the largest number of threads gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties.
device_number: device number of the used GPU. By default, the zeroth device is used.
""" """
def __init__(self, field, iteration_slice, def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False, block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64)): maximum_block_size=(1024, 1024, 64), device_number=None):
if field.spatial_dimensions > 3: super(BlockIndexing, self).__init__(iteration_space, data_layout)
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout: if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, field.layout) block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size self._block_size = block_size
if maximum_block_size == 'auto': if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be specified'
# Get device limits # Get device limits
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences # See https://github.com/cupy/cupy/issues/7676
import pycuda.autoinit # NOQA if cp.cuda.runtime.is_hip:
da = cuda.device_attribute maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
device = cuda.Context.get_device() else:
maximum_block_size = tuple(device.get_attribute(a) da = cp.cuda.Device(device_number).attributes
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)) maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size self._maximum_block_size = maximum_block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property @property
def coordinates(self): def cuda_indices(self):
offsets = _get_start_from_slice(self._iterationSlice)
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
coordinates = [block_index * bs + thread_idx + off indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)] for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
return coordinates[:self._dim] @property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self._iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self._iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self._iteration_space[3].start)
return coordinates
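# Worked example for the 4D folding above (editorial note; illustrative
# numbers): with iteration_space[1] = slice(0, 8, 1) the width is 8, so a
# linear x index of 19 unfolds to (div_floor(19, 8), sp.Mod(19, 8)) == (2, 3)
# for the two leading coordinates; the y and z indices drive the remaining two.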
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape): def call_parameters(self, arr_shape):
substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None} numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
_get_end_from_slice(self._iterationSlice, arr_shape))]
widths = sp.Matrix(widths).subs(substitution_dict)
extend_bs = (1,) * (3 - len(self._block_size)) extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs block_size = self._block_size + extend_bs
if not self._compile_time_block_size: if not self._compile_time_block_size:
@@ -147,7 +205,8 @@ class BlockIndexing(AbstractIndexing):
for i in range(len(widths)): for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size)) factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i])) adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
block_size = tuple(adapted_block_size) + extend_bs extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size)) block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size)) grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
@@ -158,42 +217,59 @@ class BlockIndexing(AbstractIndexing):
def guard(self, kernel_content, arr_shape): def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim] arr_shape = arr_shape[:self._dim]
conditions = [c < end if len(self._iteration_space) - 1 == len(arr_shape):
for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0] condition = conditions[0]
for c in conditions[1:]: for c in conditions[1:]:
condition = sp.And(condition, c) condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)]) return Block([Conditional(condition, kernel_content)])
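# The guard emits GPU code of roughly this shape (editorial sketch):
#   if (ctr_0 < end_0 && ctr_1 < end_1) { <kernel body> }
# so threads of partially filled blocks skip the body instead of writing
# out of bounds.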
@staticmethod def numeric_iteration_space(self, arr_shape):
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None): return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
"""Shrinks the block_size if there are too many registers used per multiprocessor.
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since the required_registers_per_thread are not known before compilation. This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a pycuda function. They can be obtained by ``func.num_regs`` from a cupy function.
:returns smaller block_size if too many registers are used. Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
returns: smaller block_size if too many registers are used.
""" """
import pycuda.driver as cuda import cupy as cp
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
if device is None:
device = cuda.Context.get_device()
available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
block = block_size # See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True: while True:
num_threads = 1 required_registers = math.prod(result) * required_registers_per_thread
for t in block: if required_registers <= max_registers_per_block:
num_threads *= t return result
required_registers_per_mt = num_threads * required_registers_per_thread
if required_registers_per_mt <= available_registers_per_mp:
return block
else: else:
largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e]) largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert block[largest_grid_entry_idx] >= 2 assert result[largest_list_entry_idx] >= 2
block[largest_grid_entry_idx] //= 2 result[largest_list_entry_idx] //= 2
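# Worked example (editorial note; illustrative numbers): block_size
# (128, 2, 1) with 80 registers per thread needs 128 * 2 * 1 * 80 = 20480
# registers per block. If the device limit were 16384, the largest entry is
# halved to (64, 2, 1), which needs 10240 registers and is returned.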
@staticmethod @staticmethod
def permute_block_size_according_to_layout(block_size, layout): def permute_block_size_according_to_layout(block_size, layout):
@@ -222,42 +298,48 @@ class BlockIndexing(AbstractIndexing):
class LineIndexing(AbstractIndexing): class LineIndexing(AbstractIndexing):
""" """
Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block. Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D GPU block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z} The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension is not larger than the
maximum amount of threads allowed in a CUDA block (which depends on device). maximum number of threads allowed in a GPU block (which depends on the device).
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple to determine the fast and slow coordinates.
""" """
def __init__(self, field, iteration_slice): def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX super(LineIndexing, self).__init__(iteration_space, data_layout)
if field.spatial_dimensions > 4:
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions") raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = available_indices[:field.spatial_dimensions] @property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = field.layout[-1] fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0] coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
self._coordinates = coordinates return coordinates
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
@property @property
def coordinates(self): def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))] return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def call_parameters(self, arr_shape): def get_loop_ctr_assignments(self, loop_counter_symbols):
substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None} return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice), def call_parameters(self, arr_shape):
_get_end_from_slice(self._iterationSlice, arr_shape))] numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = sp.Matrix(widths).subs(substitution_dict) widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx): def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self._coordinates: if cuda_idx not in self.cuda_indices:
return 1 return 1
else: else:
idx = self._coordinates.index(cuda_idx) idx = self.cuda_indices.index(cuda_idx)
return widths[idx] return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]), return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
@@ -272,30 +354,66 @@ class LineIndexing(AbstractIndexing):
def symbolic_parameters(self): def symbolic_parameters(self):
return set() return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
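# Usage sketch (editorial note; values are illustrative, not original source):
# with layout (0, 1) the fastest coordinate (index 1) is driven by threadIdx.x
# and the remaining coordinate by blockIdx.x:
#   indexing = LineIndexing(iteration_space=(slice(0, 32, 1), slice(0, 64, 1)),
#                           data_layout=(0, 1))
#   indexing.coordinates  # -> [blockIdx.x + 0, threadIdx.x + 0]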
# -------------------------------------- Helper functions -------------------------------------------------------------- # -------------------------------------- Helper functions --------------------------------------------------------------
def _get_start_from_slice(iteration_slice): def _get_numeric_iteration_slice(iteration_slice, arr_shape):
res = [] res = []
for slice_component in iteration_slice: for slice_component, shape in zip(iteration_slice, arr_shape):
if type(slice_component) is slice: result_slice = slice_component
res.append(slice_component.start if slice_component.start is not None else 0) if not isinstance(result_slice.start, int):
else: start = result_slice.start
assert isinstance(slice_component, int) assert len(start.free_symbols) == 1
res.append(slice_component) start = start.subs({symbol: shape for symbol in start.free_symbols})
result_slice = slice(start, result_slice.stop, result_slice.step)
if not isinstance(result_slice.stop, int):
stop = result_slice.stop
assert len(stop.free_symbols) == 1
stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
result_slice = slice(result_slice.start, stop, result_slice.step)
assert isinstance(result_slice.step, int)
res.append(result_slice)
return res return res
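# Example (editorial note; illustrative): with arr_shape == (32,) a symbolic
# slice such as slice(1, _size_f_0 - 1, 1) is resolved to slice(1, 31, 1) by
# substituting the array shape for the single free symbol in the stop
# expression.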
def _get_end_from_slice(iteration_slice, arr_shape): def _get_widths(iteration_slice):
iter_slice = normalize_slice(iteration_slice, arr_shape) widths = []
res = [] for iter_slice in iteration_slice:
for slice_component in iter_slice: step = iter_slice.step
if type(slice_component) is slice: assert isinstance(step, int), f"Step can only be of type int, not of type {type(step)}"
res.append(slice_component.stop) start = iter_slice.start
stop = iter_slice.stop
if step == 1:
if stop - start == 0:
widths.append(1)
else:
widths.append(stop - start)
else: else:
assert isinstance(slice_component, int) width = (stop - start) / step
res.append(slice_component + 1) if isinstance(width, int):
return res widths.append(width)
elif isinstance(width, float):
widths.append(math.ceil(width))
else:
widths.append(div_ceil(stop - start, step))
return widths
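# Worked examples (editorial note; illustrative): slice(0, 10, 1) yields
# width 10; slice(2, 10, 2) yields ceil((10 - 2) / 2) == 4, i.e. the
# iteration visits 2, 4, 6, 8.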
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
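# Worked example (editorial note; illustrative): for slice(2, 10, 2) the
# offset is 2 * 2 - 2 == 2, so a thread coordinate c (which already includes
# the start offset 2) is mapped to loop_ctr = 2 * c - 2, i.e. c = 2, 3, 4, 5
# produces the counters 2, 4, 6, 8.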
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params): def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
...
import numpy as np import sympy as sp
from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
from pystencils.data_types import StructType, TypedSymbol from pystencils.config import CreateKernelConfig
from pystencils.typing import StructType, TypedSymbol
from pystencils.typing.transformations import add_types
from pystencils.field import Field, FieldType from pystencils.field import Field, FieldType
from pystencils.enums import Target, Backend from pystencils.enums import Target, Backend
from pystencils.gpucuda.cudajit import make_python_function from pystencils.gpu.gpujit import make_python_function
from pystencils.gpucuda.indexing import BlockIndexing from pystencils.node_collection import NodeCollection
from pystencils.gpu.indexing import indexing_creator_from_params
from pystencils.slicing import normalize_slice
from pystencils.transformations import ( from pystencils.transformations import (
add_types, get_base_buffer_index, get_common_shape, implement_interpolations, get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
parse_base_pointer_info, resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols) resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
def create_cuda_kernel(assignments, def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
function_name="kernel",
type_info=None, function_name = config.function_name
indexing_creator=BlockIndexing, indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
iteration_slice=None, iteration_slice = config.iteration_slice
ghost_layers=None, ghost_layers = config.ghost_layers
skip_independence_check=False,
use_textures_for_interpolation=True): fields_written = assignments.bound_fields
assert assignments, "Assignments must not be empty!" fields_read = assignments.rhs_fields
fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check) assignments = assignments.all_assignments
assignments = add_types(assignments, config)
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
buffers = set([f for f in all_fields if FieldType.is_buffer(f) or FieldType.is_custom(f)]) buffers = set([f for f in all_fields if FieldType.is_buffer(f)])
fields_without_buffers = all_fields - buffers fields_without_buffers = all_fields - buffers
field_accesses = set() field_accesses = set()
num_buffer_accesses = 0 num_buffer_accesses = 0
indexed_elements = set()
for eq in assignments: for eq in assignments:
indexed_elements.update(eq.atoms(sp.Indexed))
field_accesses.update(eq.atoms(Field.Access)) field_accesses.update(eq.atoms(Field.Access))
field_accesses = {e for e in field_accesses if not e.is_absolute_access} field_accesses = {e for e in field_accesses if not e.is_absolute_access}
num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field)) num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
common_shape = get_common_shape(fields_without_buffers) # common shape and field to form the iteration space
common_field = get_common_field(fields_without_buffers)
common_shape = common_field.spatial_shape
if iteration_slice is None: if iteration_slice is None:
# determine iteration slice from ghost layers # determine iteration slice from ghost layers
@@ -52,17 +63,28 @@ def create_cuda_kernel(assignments,
iteration_slice.append(slice(ghost_layers[i][0], iteration_slice.append(slice(ghost_layers[i][0],
-ghost_layers[i][1] if ghost_layers[i][1] > 0 else None)) -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice) iteration_space = normalize_slice(iteration_slice, common_shape)
coord_mapping = indexing.coordinates else:
iteration_space = normalize_slice(iteration_slice, common_shape)
iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
if len(indexed_elements) > 0:
common_indexed_element = get_common_indexed_element(indexed_elements)
index = common_indexed_element.indices[0].atoms(TypedSymbol)
assert len(index) == 1, "index expressions must only contain one symbol representing the index"
indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
data_layout=common_field.layout)
extended_ctrs = [index.pop(), *loop_counter_symbols]
loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
else:
indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
assignments = loop_counter_assignments + assignments
block = indexing.guard(Block(assignments), common_shape)
cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
for i, value in enumerate(coord_mapping)]
cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
assignments = cell_idx_assignments + assignments
block = Block(assignments)
block = indexing.guard(block, common_shape)
unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers) unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
ast = KernelFunction(block, ast = KernelFunction(block,
@@ -74,19 +96,18 @@ def create_cuda_kernel(assignments,
assignments=assignments) assignments=assignments)
ast.global_variables.update(indexing.index_variables) ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation) base_pointer_spec = config.base_pointer_specification
if base_pointer_spec is None:
base_pointer_spec = [['spatialInner0']] base_pointer_spec = []
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
f.spatial_dimensions, f.index_dimensions) f.spatial_dimensions, f.index_dimensions)
for f in all_fields} for f in all_fields}
coord_mapping = {f.name: cell_idx_symbols for f in all_fields} coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
loop_strides = list(fields_without_buffers)[0].shape
if any(FieldType.is_buffer(f) for f in all_fields): if any(FieldType.is_buffer(f) for f in all_fields):
resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields) base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info, resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
field_to_fixed_coordinates=coord_mapping) field_to_fixed_coordinates=coord_mapping)
@@ -105,34 +126,41 @@ def create_cuda_kernel(assignments,
return ast return ast
def created_indexed_cuda_kernel(assignments, def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
index_fields,
function_name="kernel", index_fields = config.index_fields
type_info=None, function_name = config.function_name
coordinate_names=('x', 'y', 'z'), coordinate_names = config.coordinate_names
indexing_creator=BlockIndexing, indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
use_textures_for_interpolation=True): fields_written = assignments.bound_fields
fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False) fields_read = assignments.rhs_fields
all_fields = fields_read.union(fields_written) all_fields = fields_read.union(fields_written)
read_only_fields = set([f.name for f in fields_read - fields_written]) read_only_fields = set([f.name for f in fields_read - fields_written])
# extract the index fields based on their names. The original index field might have been modified
index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates. " \
f"Non-index fields are {non_index_fields}, spatial coordinates are " \
f"{spatial_coordinates}"
spatial_coordinates = list(spatial_coordinates)[0]
assignments = assignments.all_assignments
assignments = add_types(assignments, config)
for index_field in index_fields: for index_field in index_fields:
index_field.field_type = FieldType.INDEXED index_field.field_type = FieldType.INDEXED
assert FieldType.is_indexed(index_field) assert FieldType.is_indexed(index_field)
assert index_field.spatial_dimensions == 1, "Index fields have to be 1D" assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
non_index_fields = [f for f in all_fields if f not in index_fields]
spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
spatial_coordinates = list(spatial_coordinates)[0]
def get_coordinate_symbol_assignment(name): def get_coordinate_symbol_assignment(name):
for ind_f in index_fields: for ind_f in index_fields:
assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type" assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
data_type = ind_f.dtype data_type = ind_f.dtype
if data_type.has_element(name): if data_type.has_element(name):
rhs = ind_f[0](name) rhs = ind_f[0](name)
lhs = TypedSymbol(name, np.int64) lhs = TypedSymbol(name, data_type.get_element_type(name))
return SympyAssignment(lhs, rhs) return SympyAssignment(lhs, rhs)
raise ValueError(f"Index {name} not found in any of the passed index fields") raise ValueError(f"Index {name} not found in any of the passed index fields")
@@ -141,17 +169,19 @@ def created_indexed_cuda_kernel(assignments,
coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments] coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
idx_field = list(index_fields)[0] idx_field = list(index_fields)[0]
indexing = indexing_creator(field=idx_field,
iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape)) iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
idx_field.spatial_shape)
indexing = indexing_creator(iteration_space=iteration_space,
data_layout=idx_field.layout)
function_body = Block(coordinate_symbol_assignments + assignments) function_body = Block(coordinate_symbol_assignments + assignments)
function_body = indexing.guard(function_body, get_common_shape(index_fields)) function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function, ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
None, function_name, assignments=assignments) None, function_name, assignments=assignments)
ast.global_variables.update(indexing.index_variables) ast.global_variables.update(indexing.index_variables)
implement_interpolations(ast, implement_by_texture_accesses=use_textures_for_interpolation)
coord_mapping = indexing.coordinates coord_mapping = indexing.coordinates
base_pointer_spec = [['spatialInner0']] base_pointer_spec = [['spatialInner0']]
base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0], base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
...
import numpy as np import numpy as np
from itertools import product from itertools import product
import pystencils.gpucuda from pystencils import CreateKernelConfig, create_kernel
import pystencils.opencl from pystencils.gpu import make_python_function
from pystencils import Assignment, Field from pystencils import Assignment, Field
from pystencils.gpucuda.kernelcreation import create_cuda_kernel
from pystencils.enums import Target from pystencils.enums import Target
from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
@@ -27,24 +26,21 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
eq = Assignment(f(*i), f[tuple(offset)](*i)) eq = Assignment(f(*i), f[tuple(offset)](*i))
update_eqs.append(eq) update_eqs.append(eq)
ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True) config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
ast = create_kernel(update_eqs, config=config)
return ast return ast
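# Usage sketch for the periodicity functor created by
# get_periodic_boundary_functor below (editorial note; the module path, the
# field name 'pdfs' and the shapes are assumptions):
import cupy as cp
from pystencils.gpu.periodicity import get_periodic_boundary_functor

periodic = get_periodic_boundary_functor(stencil=[(1, 0), (-1, 0), (0, 1), (0, -1)],
                                         domain_size=(34, 34))  # shape including ghost layers (assumption)
pdfs_arr = cp.zeros((34, 34))
periodic(pdfs=pdfs_arr)  # copies each boundary slice into the opposite ghost layer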
def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1, def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
thickness=None, dtype=float, target=Target.GPU, opencl_queue=None, opencl_ctx=None): thickness=None, dtype=np.float64, target=Target.GPU):
assert target in {Target.GPU, Target.OPENCL} assert target == Target.GPU
src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness) src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
kernels = [] kernels = []
for src_slice, dst_slice in src_dst_slice_tuples: for src_slice, dst_slice in src_dst_slice_tuples:
ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype) ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
if target == pystencils.Target.GPU: kernels.append(make_python_function(ast))
kernels.append(pystencils.gpucuda.make_python_function(ast))
else:
ast._target = pystencils.Target.OPENCL
ast._backend = pystencils.Backend.OPENCL
kernels.append(pystencils.opencl.make_python_function(ast, opencl_queue, opencl_ctx))
def functor(pdfs, **_): def functor(pdfs, **_):
for kernel in kernels: for kernel in kernels:
...