5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6 · 5600b6b6
--- a/pystencils/gpucuda/__init__.py
+++ b/pystencils/gpucuda/__init__.py
-from pystencils.gpucuda.cudajit import make_python_function
+from pystencils.gpu.gpu_array_handler import GPUArrayHandler, GPUNotAvailableHandler
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
+from pystencils.gpu.gpujit import make_python_function
+from pystencils.gpu.kernelcreation import create_cuda_kernel, created_indexed_cuda_kernel
 from .indexing import AbstractIndexing, BlockIndexing, LineIndexing
-__all__ = ['create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
+__all__ = ['GPUArrayHandler', 'GPUNotAvailableHandler',
+           'create_cuda_kernel', 'created_indexed_cuda_kernel', 'make_python_function',
           'AbstractIndexing', 'BlockIndexing', 'LineIndexing']
--- a/src/pystencils/gpu/gpu_array_handler.py
+++ b/src/pystencils/gpu/gpu_array_handler.py
+try:
+    import cupy as cp
+    import cupyx as cpx
+except ImportError:
+    cp = None
+    cpx = None
+import numpy as np
+class GPUArrayHandler:
+    """Creates cupy arrays on one fixed GPU and copies data between numpy and cupy arrays.
+    Every allocation and transfer is issued while ``cp.cuda.Device(device_number)`` is the current device.
+    Args:
+        device_number: id of the GPU device this handler operates on
+    """
+    def __init__(self, device_number):
+        self._device_number = device_number
+    def zeros(self, shape, dtype=np.float64, order='C'):
+        """Return a cupy array of the given shape, dtype and order filled with zeros."""
+        with cp.cuda.Device(self._device_number):
+            return cp.zeros(shape=shape, dtype=dtype, order=order)
+    def ones(self, shape, dtype=np.float64, order='C'):
+        """Return a cupy array of the given shape, dtype and order filled with ones."""
+        with cp.cuda.Device(self._device_number):
+            return cp.ones(shape=shape, dtype=dtype, order=order)
+    def empty(self, shape, dtype=np.float64, order='C'):
+        """Return an uninitialised cupy array of the given shape, dtype and order."""
+        with cp.cuda.Device(self._device_number):
+            return cp.empty(shape=shape, dtype=dtype, order=order)
+    def to_gpu(self, numpy_array):
+        """Copy a numpy array to the handler's device and return the resulting cupy array.
+        If ``numpy_array`` is a view on another ``np.ndarray``, the base array is transferred and the axis swaps
+        reported by ``_get_index_swaps`` are re-applied to the GPU array in reverse order.
+        Otherwise the array itself is transferred.
+        Args:
+            numpy_array: host array to copy
+        Returns:
+            cupy array holding the data of ``numpy_array``
+        """
+        with cp.cuda.Device(self._device_number):
+            if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+                swaps = _get_index_swaps(numpy_array)
+                gpu_array = cp.asarray(numpy_array.base)
+                for a, b in reversed(swaps):
+                    gpu_array = gpu_array.swapaxes(a, b)
+                return gpu_array
+            # Plain (non-view) arrays are created inside the device context as well, so that they
+            # end up on the configured device instead of on whichever device is currently active.
+            return cp.asarray(numpy_array)
+    def upload(self, array, numpy_array):
+        """Copy the content of ``numpy_array`` into the existing GPU array ``array``.
+        Args:
+            array: destination cupy array; must live on this handler's device
+            numpy_array: source host array
+        """
+        assert self._device_number == array.device.id
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            # NOTE(review): relies on ``array.base`` being set whenever the host array is a view - confirm
+            with cp.cuda.Device(self._device_number):
+                array.base.set(numpy_array.base)
+        else:
+            with cp.cuda.Device(self._device_number):
+                array.set(numpy_array)
+    def download(self, array, numpy_array):
+        """Copy the content of the GPU array ``array`` into the existing host array ``numpy_array``.
+        Args:
+            array: source cupy array; must live on this handler's device
+            numpy_array: destination host array, written in place
+        """
+        assert self._device_number == array.device.id
+        if numpy_array.base is not None and isinstance(numpy_array.base, np.ndarray):
+            # NOTE(review): relies on ``array.base`` being set whenever the host array is a view - confirm
+            with cp.cuda.Device(self._device_number):
+                numpy_array.base[:] = array.base.get()
+        else:
+            with cp.cuda.Device(self._device_number):
+                numpy_array[:] = array.get()
+    def randn(self, shape, dtype=np.float64):
+        """Return a cupy array of the given shape created by ``cp.random.randn``."""
+        with cp.cuda.Device(self._device_number):
+            return cp.random.randn(*shape, dtype=dtype)
+    @staticmethod
+    def pinned_numpy_array(layout, shape, dtype):
+        """Allocate an array with ``cpx.empty_pinned`` whose axes are ordered according to ``layout``.
+        The shape is permuted, the memory is allocated in C order, and the axes are swapped back afterwards,
+        so the returned array has the requested ``shape``.
+        Args:
+            layout: permutation of ``range(len(shape))``
+            shape: shape of the returned array
+            dtype: element type of the returned array
+        Returns:
+            array returned by ``cpx.empty_pinned`` with its axes swapped back to ``shape``
+        """
+        assert set(layout) == set(range(len(shape))), "Wrong layout descriptor"
+        cur_layout = list(range(len(shape)))
+        swaps = []
+        # Record the pairwise swaps that turn the identity ordering into ``layout``.
+        for i, target in enumerate(layout):
+            if cur_layout[i] != target:
+                index_to_swap_with = cur_layout.index(target)
+                swaps.append((i, index_to_swap_with))
+                cur_layout[i], cur_layout[index_to_swap_with] = cur_layout[index_to_swap_with], cur_layout[i]
+        assert tuple(cur_layout) == tuple(layout)
+        shape = list(shape)
+        for a, b in swaps:
+            shape[a], shape[b] = shape[b], shape[a]
+        res = cpx.empty_pinned(tuple(shape), order='c', dtype=dtype)
+        # Undo the swaps on the allocated array so that its shape matches the requested one again.
+        for a, b in reversed(swaps):
+            res = res.swapaxes(a, b)
+        return res
+    # Alias kept so that callers can use either name.
+    from_numpy = to_gpu
+class GPUNotAvailableHandler:
+    """Placeholder handler: any attribute access on an instance raises ``NotImplementedError``.
+    Presumably used in place of ``GPUArrayHandler`` when cupy cannot be imported - verify against callers.
+    """
+    def __getattribute__(self, name):
+        # __getattribute__ is consulted for every attribute lookup, so no method of this object is usable.
+        raise NotImplementedError("Unable to utilise cupy! Please make sure cupy works correctly in your setup!")
+def _get_index_swaps(array):
+    """Collect the axis pairs whose strides sit at different positions in ``array`` and in ``array.base``.
+    Args:
+        array: numpy array, possibly a view on another ``np.ndarray``
+    Returns:
+        list of ``(index_base, index_view)`` tuples; empty if ``array`` has no ``np.ndarray`` base
+    """
+    swaps = []
+    if array.base is not None and isinstance(array.base, np.ndarray):
+        for stride in array.base.strides:
+            # ``index`` returns the first match and raises ValueError if the view has no axis with this stride.
+            # NOTE(review): looks like this expects views that only permute the base's axes - confirm
+            index_base = array.base.strides.index(stride)
+            index_view = array.strides.index(stride)
+            # Skip a pair whose mirrored version was already recorded, so each swap is listed once.
+            if index_base != index_view and (index_view, index_base) not in swaps:
+                swaps.append((index_base, index_view))
+    return swaps
--- a/pystencils/gpucuda/cudajit.py
+++ b/pystencils/gpucuda/cudajit.py
@@ -4,9 +4,9 @@ from pystencils.backends.cbackend import get_headers
 from pystencils.backends.cuda_backend import generate_cuda
 from pystencils.typing import StructType
 from pystencils.field import FieldType
-from pystencils.include import get_pycuda_include_path, get_pystencils_include_path
+from pystencils.include import get_pystencils_include_path
 from pystencils.kernel_wrapper import KernelWrapper
-from pystencils.typing.typed_sympy import FieldPointerSymbol
+from pystencils.typing import BasicType, FieldPointerSymbol
 USE_FAST_MATH = True
@@ -21,39 +21,49 @@ def get_cubic_interpolation_include_paths():
 def make_python_function(kernel_function_node, argument_dict=None, custom_backend=None):
    """
    Creates a kernel function from an abstract syntax tree which
-    was created e.g. by :func:`pystencils.gpucuda.create_cuda_kernel`
+    was created e.g. by :func:`pystencils.gpu.create_cuda_kernel`
-    or :func:`pystencils.gpucuda.created_indexed_cuda_kernel`
+    or :func:`pystencils.gpu.created_indexed_cuda_kernel`
    Args:
        kernel_function_node: the abstract syntax tree
        argument_dict: parameters passed here are already fixed. Remaining parameters have to be passed to the
                       returned kernel functor.
+        custom_backend: use own custom printer for code generation
    Returns:
        compiled kernel as Python function
    """
-    import pycuda.autoinit  # NOQA
+    import cupy as cp
-    from pycuda.compiler import SourceModule
    if argument_dict is None:
        argument_dict = {}
-    header_list = ['<cstdint>'] + list(get_headers(kernel_function_node))
+    headers = get_headers(kernel_function_node)
+    if cp.cuda.runtime.is_hip:
+        headers.add('"gpu_defines.h"')
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<hip/hip_fp16.h>')
+    else:
+        headers.update({'"gpu_defines.h"', '<cstdint>'})
+        for field in kernel_function_node.fields_accessed:
+            if isinstance(field.dtype, BasicType) and field.dtype.is_half():
+                headers.add('<cuda_fp16.h>')
+    header_list = sorted(headers)
    includes = "\n".join([f"#include {include_file}" for include_file in header_list])
    code = includes + "\n"
    code += "#define FUNC_PREFIX __global__\n"
    code += "#define RESTRICT __restrict__\n\n"
-    code += str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
+    code += 'extern "C" {\n%s\n}\n' % str(generate_cuda(kernel_function_node, custom_backend=custom_backend))
-    nvcc_options = ["-w", "-std=c++11", "-Wno-deprecated-gpu-targets"]
+    options = ["-w", "-std=c++11"]
    if USE_FAST_MATH:
-        nvcc_options.append("-use_fast_math")
+        options.append("-use_fast_math")
+    options.append("-I" + get_pystencils_include_path())
-    mod = SourceModule(code, options=nvcc_options, include_dirs=[
-                       get_pystencils_include_path(), get_pycuda_include_path()])
-    func = mod.get_function(kernel_function_node.function_name)
+    func = cp.RawKernel(code, kernel_function_node.function_name, options=tuple(options), backend="nvrtc", jitify=True)
    parameters = kernel_function_node.get_parameters()
    cache = {}
@@ -64,7 +74,10 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
                         for k, v in kwargs.items()))
        try:
            args, block_and_thread_numbers = cache[key]
-            func(*args, **block_and_thread_numbers)
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
+                func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
        except KeyError:
            full_arguments = argument_dict.copy()
            full_arguments.update(kwargs)
@@ -75,11 +88,16 @@ def make_python_function(kernel_function_node, argument_dict=None, custom_backen
            block_and_thread_numbers['block'] = tuple(int(i) for i in block_and_thread_numbers['block'])
            block_and_thread_numbers['grid'] = tuple(int(i) for i in block_and_thread_numbers['grid'])
-            args = _build_numpy_argument_list(parameters, full_arguments)
+            args = tuple(_build_numpy_argument_list(parameters, full_arguments))
            cache[key] = (args, block_and_thread_numbers)
            cache_values.append(kwargs)  # keep objects alive such that ids remain unique
-            func(*args, **block_and_thread_numbers)
+            device = set(a.device.id for a in args if type(a) is cp.ndarray)
-        # import pycuda.driver as cuda
+            assert len(device) == 1, "All arrays used by a kernel need to be allocated on the same device"
+            with cp.cuda.Device(device.pop()):
+                func(block_and_thread_numbers['grid'], block_and_thread_numbers['block'], args)
+                # useful for debugging:
+                # cp.cuda.runtime.deviceSynchronize()
        # cuda.Context.synchronize() # useful for debugging, to get errors right after kernel was called
    ast = kernel_function_node
    parameters = kernel_function_node.get_parameters()
@@ -98,8 +116,8 @@ def _build_numpy_argument_list(parameters, argument_dict):
            actual_type = array.dtype
            expected_type = param.fields[0].dtype.numpy_dtype
            if expected_type != actual_type:
-                raise ValueError("Data type mismatch for field '%s'. Expected '%s' got '%s'." %
+                raise ValueError(f"Data type mismatch for field {param.field_name}. "
-                                 (param.field_name, expected_type, actual_type))
+                                 f"Expected {expected_type} got {actual_type}.")
            result.append(array)
        elif param.is_field_stride:
            cast_to_dtype = param.symbol.dtype.numpy_dtype.type
@@ -134,22 +152,22 @@ def _check_arguments(parameter_specification, argument_dict):
            try:
                field_arr = argument_dict[symbolic_field.name]
            except KeyError:
-                raise KeyError("Missing field parameter for kernel call " + str(symbolic_field))
+                raise KeyError(f"Missing field parameter for kernel call {str(symbolic_field)}")
            if symbolic_field.has_fixed_shape:
                symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
                if isinstance(symbolic_field.dtype, StructType):
                    symbolic_field_shape = symbolic_field_shape[:-1]
                if symbolic_field_shape != field_arr.shape:
-                    raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
+                    raise ValueError(f"Passed array {symbolic_field.name} has shape {str(field_arr.shape)} "
-                                     (symbolic_field.name, str(field_arr.shape), str(symbolic_field.shape)))
+                                     f"which does not match expected shape {str(symbolic_field.shape)}")
            if symbolic_field.has_fixed_shape:
                symbolic_field_strides = tuple(int(i) * field_arr.dtype.itemsize for i in symbolic_field.strides)
                if isinstance(symbolic_field.dtype, StructType):
                    symbolic_field_strides = symbolic_field_strides[:-1]
                if symbolic_field_strides != field_arr.strides:
-                    raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
+                    raise ValueError(f"Passed array {symbolic_field.name} has strides {str(field_arr.strides)} "
-                                     (symbolic_field.name, str(field_arr.strides), str(symbolic_field_strides)))
+                                     f"which does not match expected strides {str(symbolic_field_strides)}")
            if FieldType.is_indexed(symbolic_field):
                index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
@@ -157,9 +175,9 @@ def _check_arguments(parameter_specification, argument_dict):
                array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
    if len(array_shapes) > 1:
-        raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
+        raise ValueError(f"All passed arrays have to have the same size {str(array_shapes)}")
    if len(index_arr_shapes) > 1:
-        raise ValueError("All passed index arrays have to have the same size " + str(array_shapes))
+        raise ValueError(f"All passed index arrays have to have the same size {str(array_shapes)}")
    if len(index_arr_shapes) > 0:
        return list(index_arr_shapes)[0]

--- a/pystencils/gpucuda/indexing.py
+++ b/pystencils/gpucuda/indexing.py
 import abc
 from functools import partial
+import math
+from typing import List, Tuple
 import sympy as sp
 from sympy.core.cache import cacheit
-from pystencils.astnodes import Block, Conditional
+from pystencils.astnodes import Block, Conditional, SympyAssignment
 from pystencils.typing import TypedSymbol, create_type
 from pystencils.integer_functions import div_ceil, div_floor
-from pystencils.slicing import normalize_slice
 from pystencils.sympyextensions import is_integer_sequence, prod
@@ -32,23 +33,58 @@ GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for c
 class AbstractIndexing(abc.ABC):
    """
-    Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
+    Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
-    field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
+    to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
-    and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
+    and computes the number of blocks and threads a kernel is started with.
-    a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
+    The Indexing class is created with an iteration space that is given as list of slices to determine start, stop
+    and the step size for each coordinate. Further the data_layout is given as tuple to determine the fast and slow
+    coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
    """
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
+        for iter_space in iteration_space:
+            assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
+                                                  f"not tuple of type {type(iter_space)}"
+        self._iteration_space = iteration_space
+        self._data_layout = data_layout
+        self._dim = len(iteration_space)
+    @property
+    def iteration_space(self):
+        """Iteration space to loop over"""
+        return self._iteration_space
+    @property
+    def data_layout(self):
+        """Data layout of the kernels arrays"""
+        return self._data_layout
+    @property
+    def dim(self):
+        """Number of spatial dimensions"""
+        return self._dim
    @property
    @abc.abstractmethod
    def coordinates(self):
-        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
+        """Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
        These symbolic indices can be obtained with the method `index_variables` """
    @property
    def index_variables(self):
-        """Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
+        """Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
        return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
+    @abc.abstractmethod
+    def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
+        """Adds assignments for the loop counter symbols depending on the gpu threads.
+        Args:
+            loop_counter_symbols: typed symbols representing the loop counters
+        Returns:
+            assignments for the loop counters
+        """
    @abc.abstractmethod
    def call_parameters(self, arr_shape):
        """Determine grid and block size for kernel call.
@@ -88,57 +124,79 @@ class AbstractIndexing(abc.ABC):
 class BlockIndexing(AbstractIndexing):
-    """Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
+    """Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
    Args:
-        field: pystencils field (common to all Indexing classes)
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
-        iteration_slice: slice that defines rectangular subarea which is iterated over
+        data_layout: tuple specifying loop order with innermost loop last.
+                     This is the same format as returned by `Field.layout`.
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest amount of threads
-        compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
+        compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
+        maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
+                            maximum block size from the device properties
+        device_number: device number of the used GPU. By default, the zeroth device is used.
    """
-    def __init__(self, field, iteration_slice,
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
-                 block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
+                 block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
-                 maximum_block_size=(1024, 1024, 64)):
+                 maximum_block_size=(1024, 1024, 64), device_number=None):
-        if field.spatial_dimensions > 3:
+        super(BlockIndexing, self).__init__(iteration_space, data_layout)
-            raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
+        if self._dim > 4:
+            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
-        if permute_block_size_dependent_on_layout:
+        if permute_block_size_dependent_on_layout and self._dim < 4:
-            block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
+            block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
        self._block_size = block_size
        if maximum_block_size == 'auto':
+            assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
            # Get device limits
-            import pycuda.driver as cuda
+            import cupy as cp
-            # noinspection PyUnresolvedReferences
+            # See https://github.com/cupy/cupy/issues/7676
-            import pycuda.autoinit  # NOQA
+            if cp.cuda.runtime.is_hip:
-            da = cuda.device_attribute
+                maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
-            device = cuda.Context.get_device()
+            else:
-            maximum_block_size = tuple(device.get_attribute(a)
+                da = cp.cuda.Device(device_number).attributes
-                                       for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z))
+                maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
        self._maximum_block_size = maximum_block_size
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._dim = field.spatial_dimensions
-        self._symbolic_shape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
        self._compile_time_block_size = compile_time_block_size
+        self._device_number = device_number
    @property
-    def coordinates(self):
+    def cuda_indices(self):
-        offsets = _get_start_from_slice(self._iterationSlice)
        block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
-        coordinates = [block_index * bs + thread_idx + off
+        indices = [block_index * bs + thread_idx
-                       for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
+                   for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
+        return indices[:self._dim]
-        return coordinates[:self._dim]
+    @property
+    def coordinates(self):
+        if self._dim < 4:
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
+            return coordinates[:self._dim]
+        else:
+            coordinates = list()
+            width = self._iteration_space[1].stop - self.iteration_space[1].start
+            coordinates.append(div_floor(self.cuda_indices[0], width))
+            coordinates.append(sp.Mod(self.cuda_indices[0], width))
+            coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
+            coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
+            return coordinates
+    def get_loop_ctr_assignments(self, loop_counter_symbols):
+        return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
    def call_parameters(self, arr_shape):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolic_shape, arr_shape) if sym is not None}
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        widths = _get_widths(numeric_iteration_slice)
+        if len(widths) > 3:
+            widths = [widths[0] * widths[1], widths[2], widths[3]]
-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
-        widths = sp.Matrix(widths).subs(substitution_dict)
        extend_bs = (1,) * (3 - len(self._block_size))
        block_size = self._block_size + extend_bs
        if not self._compile_time_block_size:
@@ -159,42 +217,59 @@ class BlockIndexing(AbstractIndexing):
    def guard(self, kernel_content, arr_shape):
        arr_shape = arr_shape[:self._dim]
-        conditions = [c < end
+        if len(self._iteration_space) - 1 == len(arr_shape):
-                      for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
+            numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
+            numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
+        else:
+            assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
+            numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
+        end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
+        for i, s in enumerate(numeric_iteration_slice):
+            if s.step and s.step != 1:
+                end[i] = div_ceil(s.stop - s.start, s.step) + s.start
+        if self._dim < 4:
+            conditions = [c < e for c, e in zip(self.coordinates, end)]
+        else:
+            end = [end[0] * end[1], end[2], end[3]]
+            coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
+            conditions = [c < e for c, e in zip(coordinates, end)]
        condition = conditions[0]
        for c in conditions[1:]:
            condition = sp.And(condition, c)
        return Block([Conditional(condition, kernel_content)])
-    @staticmethod
+    def numeric_iteration_space(self, arr_shape):
-    def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
-        """Shrinks the block_size if there are too many registers used per multiprocessor.
+    def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
+        """Shrinks the block_size if there are too many registers used per block.
        This is not done automatically, since the required_registers_per_thread are not known before compilation.
-        They can be obtained by ``func.num_regs`` from a pycuda function.
+        They can be obtained by ``func.num_regs`` from a cupy function.
-        :returns smaller block_size if too many registers are used.
+        Args:
+            block_size: used block size that is target for limiting
+            required_registers_per_thread: needed registers per thread
+        returns: smaller block_size if too many registers are used.
        """
-        import pycuda.driver as cuda
+        import cupy as cp
-        # noinspection PyUnresolvedReferences
-        import pycuda.autoinit  # NOQA
-        da = cuda.device_attribute
-        if device is None:
-            device = cuda.Context.get_device()
-        available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
-        block = block_size
+        # See https://github.com/cupy/cupy/issues/7676
+        if cp.cuda.runtime.is_hip:
+            max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
+        else:
+            device = cp.cuda.Device(self._device_number)
+            da = device.attributes
+            max_registers_per_block = da.get("MaxRegistersPerBlock")
+        result = list(block_size)
        while True:
-            num_threads = 1
+            required_registers = math.prod(result) * required_registers_per_thread
-            for t in block:
+            if required_registers <= max_registers_per_block:
-                num_threads *= t
+                return result
-            required_registers_per_mt = num_threads * required_registers_per_thread
-            if required_registers_per_mt <= available_registers_per_mp:
-                return block
            else:
-                largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
+                largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
-                assert block[largest_grid_entry_idx] >= 2
+                assert result[largest_list_entry_idx] >= 2
-                block[largest_grid_entry_idx] //= 2
+                result[largest_list_entry_idx] //= 2
    @staticmethod
    def permute_block_size_according_to_layout(block_size, layout):
@@ -223,42 +298,48 @@ class BlockIndexing(AbstractIndexing):
 class LineIndexing(AbstractIndexing):
    """
-    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
+    Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D GPU block.
    The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
    This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
-    maximum amount of threads allowed in a CUDA block (which depends on device).
+    maximum amount of threads allowed in a GPU block (which depends on device).
+    Args:
+        iteration_space: list of slices to determine start, stop and the step size for each coordinate
+        data_layout: tuple to determine the fast and slow coordinates.
    """
-    def __init__(self, field, iteration_slice):
+    def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
-        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
+        super(LineIndexing, self).__init__(iteration_space, data_layout)
-        if field.spatial_dimensions > 4:
+        if len(iteration_space) > 4:
            raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
-        coordinates = available_indices[:field.spatial_dimensions]
+    @property
+    def cuda_indices(self):
+        available_indices = [THREAD_IDX[0]] + BLOCK_IDX
+        coordinates = available_indices[:self.dim]
-        fastest_coordinate = field.layout[-1]
+        fastest_coordinate = self.data_layout[-1]
        coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
-        self._coordinates = coordinates
+        return coordinates
-        self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
-        self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
    @property
    def coordinates(self):
-        return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
+        return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
-    def call_parameters(self, arr_shape):
+    def get_loop_ctr_assignments(self, loop_counter_symbols):
-        substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
+        return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
-        widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
+    def call_parameters(self, arr_shape):
-                                                    _get_end_from_slice(self._iterationSlice, arr_shape))]
+        numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
-        widths = sp.Matrix(widths).subs(substitution_dict)
+        widths = _get_widths(numeric_iteration_slice)
        def get_shape_of_cuda_idx(cuda_idx):
-            if cuda_idx not in self._coordinates:
+            if cuda_idx not in self.cuda_indices:
                return 1
            else:
-                idx = self._coordinates.index(cuda_idx)
+                idx = self.cuda_indices.index(cuda_idx)
                return widths[idx]
        return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
@@ -273,30 +354,66 @@ class LineIndexing(AbstractIndexing):
    def symbolic_parameters(self):
        return set()
+    def numeric_iteration_space(self, arr_shape):
+        return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
 # -------------------------------------- Helper functions --------------------------------------------------------------
-def _get_start_from_slice(iteration_slice):
+def _get_numeric_iteration_slice(iteration_slice, arr_shape):
    res = []
-    for slice_component in iteration_slice:
+    for slice_component, shape in zip(iteration_slice, arr_shape):
-        if type(slice_component) is slice:
+        result_slice = slice_component
-            res.append(slice_component.start if slice_component.start is not None else 0)
+        if not isinstance(result_slice.start, int):
-        else:
+            start = result_slice.start
-            assert isinstance(slice_component, int)
+            assert len(start.free_symbols) == 1
-            res.append(slice_component)
+            start = start.subs({symbol: shape for symbol in start.free_symbols})
+            result_slice = slice(start, result_slice.stop, result_slice.step)
+        if not isinstance(result_slice.stop, int):
+            stop = result_slice.stop
+            assert len(stop.free_symbols) == 1
+            stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
+            result_slice = slice(result_slice.start, stop, result_slice.step)
+        assert isinstance(result_slice.step, int)
+        res.append(result_slice)
    return res
-def _get_end_from_slice(iteration_slice, arr_shape):
+def _get_widths(iteration_slice):
-    iter_slice = normalize_slice(iteration_slice, arr_shape)
+    widths = []
-    res = []
+    for iter_slice in iteration_slice:
-    for slice_component in iter_slice:
+        step = iter_slice.step
-        if type(slice_component) is slice:
+        assert isinstance(step, int), f"Step can only be of type int not of type {type(step)}"
-            res.append(slice_component.stop)
+        start = iter_slice.start
+        stop = iter_slice.stop
+        if step == 1:
+            if stop - start == 0:
+                widths.append(1)
+            else:
+                widths.append(stop - start)
        else:
-            assert isinstance(slice_component, int)
+            width = (stop - start) / step
-            res.append(slice_component + 1)
+            if isinstance(width, int):
-    return res
+                widths.append(width)
+            elif isinstance(width, float):
+                widths.append(math.ceil(width))
+            else:
+                widths.append(div_ceil(stop - start, step))
+    return widths
+def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
+    loop_ctr_assignments = []
+    for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
+        if isinstance(iter_slice, slice) and iter_slice.step > 1:
+            offset = (iter_slice.step * iter_slice.start) - iter_slice.start
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
+        elif iter_slice.start == iter_slice.stop:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
+        else:
+            loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
+    return loop_ctr_assignments
 def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):

--- a/pystencils/gpucuda/kernelcreation.py
+++ b/pystencils/gpucuda/kernelcreation.py
-from typing import Union
+import sympy as sp
-import numpy as np
 from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
 from pystencils.config import CreateKernelConfig
@@ -8,17 +6,16 @@ from pystencils.typing import StructType, TypedSymbol
 from pystencils.typing.transformations import add_types
 from pystencils.field import Field, FieldType
 from pystencils.enums import Target, Backend
-from pystencils.gpucuda.cudajit import make_python_function
+from pystencils.gpu.gpujit import make_python_function
 from pystencils.node_collection import NodeCollection
-from pystencils.gpucuda.indexing import indexing_creator_from_params
+from pystencils.gpu.indexing import indexing_creator_from_params
-from pystencils.simp.assignment_collection import AssignmentCollection
+from pystencils.slicing import normalize_slice
 from pystencils.transformations import (
-    get_base_buffer_index, get_common_shape, parse_base_pointer_info,
+    get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)
-def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
+def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
-                       config: CreateKernelConfig):
    function_name = config.function_name
    indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
@@ -34,17 +31,21 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])
-    buffers = set([f for f in all_fields if FieldType.is_buffer(f) or FieldType.is_custom(f)])
+    buffers = set([f for f in all_fields if FieldType.is_buffer(f)])
    fields_without_buffers = all_fields - buffers
    field_accesses = set()
    num_buffer_accesses = 0
+    indexed_elements = set()
    for eq in assignments:
+        indexed_elements.update(eq.atoms(sp.Indexed))
        field_accesses.update(eq.atoms(Field.Access))
        field_accesses = {e for e in field_accesses if not e.is_absolute_access}
        num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))
-    common_shape = get_common_shape(fields_without_buffers)
+    # common shape and field to form the iteration space
+    common_field = get_common_field(fields_without_buffers)
+    common_shape = common_field.spatial_shape
    if iteration_slice is None:
        # determine iteration slice from ghost layers
@@ -62,17 +63,28 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                iteration_slice.append(slice(ghost_layers[i][0],
                                             -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))
-    indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice)
+        iteration_space = normalize_slice(iteration_slice, common_shape)
-    coord_mapping = indexing.coordinates
+    else:
+        iteration_space = normalize_slice(iteration_slice, common_shape)
-    cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
-                            for i, value in enumerate(coord_mapping)]
+    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
-    cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
-    assignments = cell_idx_assignments + assignments
+    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]
+    if len(indexed_elements) > 0:
+        common_indexed_element = get_common_indexed_element(indexed_elements)
+        index = common_indexed_element.indices[0].atoms(TypedSymbol)
+        assert len(index) == 1, "index expressions must only contain one symbol representing the index"
+        indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
+                                    data_layout=common_field.layout)
+        extended_ctrs = [index.pop(), *loop_counter_symbols]
+        loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
+    else:
+        indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
+        loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
+    assignments = loop_counter_assignments + assignments
+    block = indexing.guard(Block(assignments), common_shape)
-    block = Block(assignments)
-    block = indexing.guard(block, common_shape)
    unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)
    ast = KernelFunction(block,
@@ -84,17 +96,18 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
                         assignments=assignments)
    ast.global_variables.update(indexing.index_variables)
-    base_pointer_spec = [['spatialInner0']]
+    base_pointer_spec = config.base_pointer_specification
+    if base_pointer_spec is None:
+        base_pointer_spec = []
    base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
                                                         f.spatial_dimensions, f.index_dimensions)
                         for f in all_fields}
-    coord_mapping = {f.name: cell_idx_symbols for f in all_fields}
+    coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
-    loop_strides = list(fields_without_buffers)[0].shape
    if any(FieldType.is_buffer(f) for f in all_fields):
-        resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields)
+        base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
+        resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)
    resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                           field_to_fixed_coordinates=coord_mapping)
@@ -113,40 +126,41 @@ def create_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
    return ast
-def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCollection],
+def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
-                                config: CreateKernelConfig):
    index_fields = config.index_fields
    function_name = config.function_name
    coordinate_names = config.coordinate_names
    indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
    fields_written = assignments.bound_fields
    fields_read = assignments.rhs_fields
-    assignments = assignments.all_assignments
-    assignments = add_types(assignments, config)
    all_fields = fields_read.union(fields_written)
    read_only_fields = set([f.name for f in fields_read - fields_written])
+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
+    assignments = assignments.all_assignments
+    assignments = add_types(assignments, config)
    for index_field in index_fields:
        index_field.field_type = FieldType.INDEXED
        assert FieldType.is_indexed(index_field)
        assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"
-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]
    def get_coordinate_symbol_assignment(name):
        for ind_f in index_fields:
            assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
            data_type = ind_f.dtype
            if data_type.has_element(name):
                rhs = ind_f[0](name)
-                lhs = TypedSymbol(name, np.int64)
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                return SympyAssignment(lhs, rhs)
        raise ValueError(f"Index {name} not found in any of the passed index fields")
@@ -155,11 +169,15 @@ def created_indexed_cuda_kernel(assignments: Union[AssignmentCollection, NodeCol
    coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]
    idx_field = list(index_fields)[0]
-    indexing = indexing_creator(field=idx_field,
-                                iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape))
+    iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
+                                      idx_field.spatial_shape)
+    indexing = indexing_creator(iteration_space=iteration_space,
+                                data_layout=idx_field.layout)
    function_body = Block(coordinate_symbol_assignments + assignments)
-    function_body = indexing.guard(function_body, get_common_shape(index_fields))
+    function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
    ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
                         None, function_name, assignments=assignments)
    ast.global_variables.update(indexing.index_variables)

--- a/pystencils/gpucuda/periodicity.py
+++ b/pystencils/gpucuda/periodicity.py
@@ -2,7 +2,7 @@ import numpy as np
 from itertools import product
 from pystencils import CreateKernelConfig, create_kernel
-import pystencils.gpucuda
+from pystencils.gpu import make_python_function
 from pystencils import Assignment, Field
 from pystencils.enums import Target
 from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
@@ -40,7 +40,7 @@ def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, inde
    for src_slice, dst_slice in src_dst_slice_tuples:
        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
-        kernels.append(pystencils.gpucuda.make_python_function(ast))
+        kernels.append(make_python_function(ast))
    def functor(pdfs, **_):
        for kernel in kernels:

--- a/pystencils/include/__init__.py
+++ b/pystencils/include/__init__.py
-from os.path import dirname, join, realpath
+from os.path import dirname, realpath
 def get_pystencils_include_path():
    return dirname(realpath(__file__))
-def get_pycuda_include_path():
-    import pycuda
-    return join(dirname(realpath(pycuda.__file__)), 'cuda')
--- a/pystencils/include/aesni_rand.h
+++ b/pystencils/include/aesni_rand.h
+/*
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2023, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
 #include <emmintrin.h> // SSE2
 #include <wmmintrin.h> // AES
 #ifdef __AVX__
@@ -551,7 +583,7 @@ QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr
 #endif
-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
    alignas(64) std::array<uint32,16> a;
    _mm512_store_si512((__m512i*) a.data(), k512);

--- a/pystencils/include/arm_neon_helpers.h
+++ b/pystencils/include/arm_neon_helpers.h
+/*
+Copyright 2021-2023, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#if defined(_MSC_VER)
+#define __ARM_NEON
+#endif
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -32,10 +67,13 @@ inline int32x4_t makeVec_s32(int a, int b, int c, int d)
 #endif
 inline void cachelineZero(void * p) {
+#if !defined(_MSC_VER) || defined(__clang__)
 	__asm__ volatile("dc zva, %0"::"r"(p):"memory");
+#endif
 }
 inline size_t _cachelineSize() {
+#if !defined(_MSC_VER) || defined(__clang__)
 	// check that dc zva is permitted
 	uint64_t dczid;
 	__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
@@ -72,6 +110,7 @@ inline size_t _cachelineSize() {
 			return size;
 		}
 	}
+#endif
 	// too much was zeroed
 	return SIZE_MAX;

--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
+/*
+Copyright 2023, Markus Holzer.
+Copyright 2023, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#pragma once
+#define POS_INFINITY __int_as_float(0x7f800000)
+#define INFINITY POS_INFINITY
+#define NEG_INFINITY __int_as_float(0xff800000)
+#ifdef __HIPCC_RTC__
+typedef __hip_uint8_t uint8_t;
+typedef __hip_int8_t int8_t;
+typedef __hip_uint16_t uint16_t;
+typedef __hip_int16_t int16_t;
+#endif
--- a/src/pystencils/include/half_precision.h
+++ b/src/pystencils/include/half_precision.h
+/*
+Copyright 2023, Markus Holzer.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
+/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Support also seems to require SSE, so ensure that the respective instruction sets are enabled.
+/// See
+///   https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+///   https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
+#pragma once
+using half    = _Float16;
--- a/pystencils/include/myintrin.h
+++ b/pystencils/include/myintrin.h
+/*
+Copyright 2019-2023, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
 #pragma once
-#if defined(__SSE2__) || defined(_MSC_VER)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
    return _mm_cvtepu32_ps(v);
 #else
    __m128i v2 = _mm_srli_epi32(v, 1);
@@ -28,13 +59,13 @@ QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, _
 }
 #endif
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
    return _mm_cvtepu64_pd(x);
 #elif defined(__clang__)
    return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
@@ -69,7 +100,7 @@ QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
 QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
    return _mm256_cvtepu32_ps(v);
 #else
    __m256i v2 = _mm256_srli_epi32(v, 1);
@@ -80,12 +111,12 @@ QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
 #endif
 }
-#if !defined(__AVX512VL__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
+#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
 __attribute__((optimize("no-associative-math")))
 #endif
 QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
 {
-#ifdef __AVX512VL__
+#if defined(__AVX512VL__) || defined(__AVX10_1__)
    return _mm256_cvtepu64_pd(x);
 #elif defined(__clang__)
    return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
@@ -99,7 +130,7 @@ QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
 }
 #endif
-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
 {
    return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);

--- a/pystencils/include/philox_rand.h
+++ b/pystencils/include/philox_rand.h
-#ifndef __OPENCL_VERSION__
+/*
-#if defined(__SSE2__) || defined(_MSC_VER)
+Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
+Copyright 2019-2024, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
+#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <emmintrin.h> // SSE2
 #endif
 #ifdef __AVX2__
 #include <immintrin.h> // AVX*
-#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 #include <smmintrin.h>  // SSE4
 #ifdef __FMA__
 #include <immintrin.h> // FMA
 #endif
 #endif
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#define __ARM_NEON
+#endif
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
-#ifdef __ARM_FEATURE_SVE
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 #include <arm_sve.h>
 #endif
@@ -34,7 +70,7 @@
 #endif
 #endif
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 #define QUALIFIERS static __forceinline__ __device__
 #elif defined(__OPENCL_VERSION__)
 #define QUALIFIERS static inline
@@ -43,6 +79,12 @@
 #include "myintrin.h"
 #endif
+#if defined(__ARM_FEATURE_SME)
+#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
+#else
+#define SVE_QUALIFIERS QUALIFIERS
+#endif
 #define PHILOX_W32_0   (0x9E3779B9)
 #define PHILOX_W32_1   (0xBB67AE85)
 #define PHILOX_M4x32_0 (0xD2511F53)
@@ -55,7 +97,9 @@
 typedef uint32_t uint32;
 typedef uint64_t uint64;
 #else
+#ifndef __HIPCC_RTC__
 #include <cstdint>
+#endif
 typedef std::uint32_t uint32;
 typedef std::uint64_t uint64;
 #endif
@@ -63,7 +107,7 @@ typedef std::uint64_t uint64;
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
 typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
 typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
-#elif defined(__ARM_FEATURE_SVE)
+#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
 typedef svfloat32_t svfloat32_st;
 typedef svfloat64_t svfloat64_st;
 #endif
@@ -71,7 +115,7 @@ typedef svfloat64_t svfloat64_st;
 QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
 {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
    // host code
 #if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
    *hip = __mulhwu(a,b);
@@ -182,8 +226,8 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3
 #endif
 }
-#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__)
+#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
-#if defined(__SSE4_1__) || defined(_MSC_VER)
+#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
 QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
 {
    __m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
@@ -665,12 +709,14 @@ QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#ifndef _MSC_VER
 QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1,
                              float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
 {
    philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
+#endif
 QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
@@ -695,6 +741,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }
+#ifndef _MSC_VER
 QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
                               uint32 key0, uint32 key1,
                               float64x2_t & rnd1, float64x2_t & rnd2)
@@ -702,10 +749,11 @@ QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
 #endif
+#endif
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
-QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
+SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
 {
    svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
    svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
@@ -718,14 +766,14 @@ QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
    ctr = svset4_u32(ctr, 3, lo0);
 }
-QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
+SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
 {
    key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
    key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
 }
 template<bool high>
-QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
+SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 {
    // convert 32 to 64 bit
    if (high)
@@ -752,9 +800,9 @@ QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
 }
-QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
-                              uint32 key0, uint32 key1,
+                                  uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -782,9 +830,9 @@ QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2,
 }
-QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
+SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
-                               uint32 key0, uint32 key1,
+                                   uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+                                   svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
    svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
    svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
@@ -805,9 +853,9 @@ QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2
    rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
 }
-QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
+                                  uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -816,16 +864,16 @@ QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
    philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
-QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                              uint32 key0, uint32 key1,
+                                  uint32 key0, uint32 key1,
-                              svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
+                                  svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
 {
    philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
 }
-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
+                                   uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
+                                   svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -834,9 +882,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
 }
-QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
+                                   uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+                                   svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
    svuint32_t ctr0v = svdup_u32(ctr0);
    svuint32_t ctr2v = svdup_u32(ctr2);
@@ -846,9 +894,9 @@ QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32
    philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
 }
-QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
+SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
-                               uint32 key0, uint32 key1,
+                                   uint32 key0, uint32 key1,
-                               svfloat64_st & rnd1, svfloat64_st & rnd2)
+                                   svfloat64_st & rnd1, svfloat64_st & rnd2)
 {
    philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
 }
@@ -1174,7 +1222,7 @@ QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ct
 }
 #endif
-#ifdef __AVX512F__
+#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
 QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
 {
    __m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));

--- a/pystencils/include/ppc_altivec_helpers.h
+++ b/pystencils/include/ppc_altivec_helpers.h
+/*
+Copyright 2021, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
 #include <altivec.h>
 #undef vector
 #undef bool

--- a/src/pystencils/include/riscv_v_helpers.h
+++ b/src/pystencils/include/riscv_v_helpers.h
+/*
+Copyright 2023, Michael Kuron.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions, and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions, and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+inline void cachelineZero(void * p) { // Issue the cbo.zero instruction on the address held in p; the body is empty unless __riscv_zicboz is defined.
+#ifdef __riscv_zicboz
+	__asm__ volatile("cbo.zero (%0)"::"r"(p):"memory"); // "memory" clobber: tells the compiler that memory contents change behind its back
+#endif // __riscv_zicboz
+}
+inline size_t _cachelineSize() { // Probe how many bytes a single cachelineZero() call clears; returns SIZE_MAX when the probe is inconclusive.
+	// allocate a local buffer and fill every byte with 0xff so that zeroed bytes can be detected afterwards
+	const size_t max_size = 0x100000; // 1 MiB: alignment of the probe address and upper bound of the scan below
+	uint8_t data[2*max_size]; // NOTE(review): 2 MiB of automatic storage - confirm the calling thread's stack can hold this
+	for (size_t i = 0; i < 2*max_size; ++i) {
+		data[i] = 0xff;
+	}
+	// find the offset of the max_size-aligned address inside data (the result is always in 1..max_size)
+	size_t offset = max_size - ((uintptr_t) data) % max_size;
+	// zero a cacheline
+	cachelineZero((void*) (data + offset));
+	// make sure that at least one byte was zeroed
+	if (data[offset] != 0) {
+		return SIZE_MAX;
+	}
+	// make sure that nothing was zeroed before the pointer
+	if (data[offset-1] == 0) {
+		return SIZE_MAX;
+	}
+	// find the first byte after the pointer that is still non-zero; its distance from the pointer is the zeroed length
+	for (size_t size = 1; size < max_size; ++size) {
+		if (data[offset + size] != 0) {
+			return size;
+		}
+	}
+	// too much was zeroed
+	return SIZE_MAX;
+}
+inline size_t cachelineSize() { // Result of the _cachelineSize() probe, or SIZE_MAX when __riscv_zicboz is not defined.
+#ifdef __riscv_zicboz
+	static size_t size = _cachelineSize(); // function-local static: the probe is evaluated once, on the first call
+	return size; // may itself be SIZE_MAX if the probe was inconclusive
+#else
+	return SIZE_MAX;
+#endif // __riscv_zicboz
+}
--- a/pystencils/integer_functions.py
+++ b/pystencils/integer_functions.py
--- a/pystencils/integer_set_analysis.py
+++ b/pystencils/integer_set_analysis.py
--- a/pystencils/jupyter.py
+++ b/pystencils/jupyter.py
--- a/pystencils/kernel_contrains_check.py
+++ b/pystencils/kernel_contrains_check.py
@@ -38,6 +38,7 @@ class KernelConstraintsCheck:
    def __init__(self, check_independence_condition=True, check_double_write_condition=True):
        self.scopes = NestedScopes()
+        self.field_reads = defaultdict(set)
        self.field_writes = defaultdict(set)
        self.fields_read = set()
        self.check_independence_condition = check_independence_condition
@@ -111,6 +112,13 @@ class KernelConstraintsCheck:
            if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
                raise ValueError(
                    f"Field {lhs.field.name} is written at two different locations")
+            if fai in self.field_reads:
+                reads = tuple(self.field_reads[fai])
+                if len(reads) > 1 or lhs.offsets != reads[0]:
+                    if self.check_independence_condition:
+                        raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
+                                         f"This means the resulting kernel would not be thread safe")
        elif isinstance(lhs, sp.Symbol):
            if self.scopes.is_defined_locally(lhs):
                raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
@@ -120,8 +128,9 @@ class KernelConstraintsCheck:
    def update_accesses_rhs(self, rhs):
        if isinstance(rhs, Field.Access) and self.check_independence_condition:
-            writes = self.field_writes[self.FieldAndIndex(
+            fai = self.FieldAndIndex(rhs.field, rhs.index)
-                rhs.field, rhs.index)]
+            writes = self.field_writes[fai]
+            self.field_reads[fai].add(rhs.offsets)
            for write_offset in writes:
                assert len(writes) == 1
                if write_offset != rhs.offsets:

--- a/pystencils/kernel_decorator.py
+++ b/pystencils/kernel_decorator.py
No results found