Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 4200 additions and 114 deletions
import abc
from functools import partial
import math
from typing import List, Tuple
import sympy as sp
from sympy.core.cache import cacheit
from pystencils.astnodes import Block, Conditional, SympyAssignment
from pystencils.typing import TypedSymbol, create_type
from pystencils.integer_functions import div_ceil, div_floor
from pystencils.sympyextensions import is_integer_sequence, prod
class ThreadIndexingSymbol(TypedSymbol):
def __new__(cls, *args, **kwds):
obj = ThreadIndexingSymbol.__xnew_cached_(cls, *args, **kwds)
return obj
def __new_stage2__(cls, name, dtype, *args, **kwargs):
obj = super(ThreadIndexingSymbol, cls).__xnew__(cls, name, dtype, *args, **kwargs)
return obj
__xnew__ = staticmethod(__new_stage2__)
__xnew_cached_ = staticmethod(cacheit(__new_stage2__))
BLOCK_IDX = [ThreadIndexingSymbol("blockIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [ThreadIndexingSymbol("threadIdx." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [ThreadIndexingSymbol("blockDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
GRID_DIM = [ThreadIndexingSymbol("gridDim." + coord, create_type("int32")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
"""
Abstract base class for all Indexing classes. An Indexing class defines how an iteration space is mapped
to GPU's block and grid system. It calculates indices based on GPU's thread and block indices
and computes the number of blocks and threads a kernel is started with.
The Indexing class is created with an iteration space that is given as list of slices to determine start, stop
and the step size for each coordinate. Further the data_layout is given as tuple to determine the fast and slow
coordinates. This is important to get an optimal mapping of coordinates to GPU threads.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
for iter_space in iteration_space:
assert isinstance(iter_space, slice), f"iteration_space must be of type Tuple[slice], " \
f"but contains an element of type {type(iter_space)}"
self._iteration_space = iteration_space
self._data_layout = data_layout
self._dim = len(iteration_space)
@property
def iteration_space(self):
"""Iteration space to loop over"""
return self._iteration_space
@property
def data_layout(self):
"""Data layout of the kernels arrays"""
return self._data_layout
@property
def dim(self):
"""Number of spatial dimensions"""
return self._dim
@property
@abc.abstractmethod
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic GPU block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """
@property
def index_variables(self):
"""Sympy symbols for GPU's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def get_loop_ctr_assignments(self, loop_counter_symbols) -> List[SympyAssignment]:
"""Adds assignments for the loop counter symbols depending on the gpu threads.
Args:
loop_counter_symbols: typed symbols representing the loop counters
Returns:
assignments for the loop counters
"""
@abc.abstractmethod
def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call.
Args:
arr_shape: the numeric (not symbolic) shape of the array
Returns:
dict with keys 'blocks' and 'threads' with tuple values for number of (x,y,z) threads and blocks
the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernel_content, arr_shape):
"""In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
Args:
kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
arr_shape: the numeric or symbolic shape of the field
Returns:
ast node, which is put inside the kernel function
"""
@abc.abstractmethod
def max_threads_per_block(self):
"""Return maximal number of threads per block for launch bounds. If this cannot be determined without
knowing the array shape return None for unknown """
@abc.abstractmethod
def symbolic_parameters(self):
"""Set of symbols required in call_parameters code"""
# -------------------------------------------- Implementations ---------------------------------------------------------
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to GPU blocks.
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple specifying loop order with innermost loop last.
This is the same format as returned by `Field.layout`.
permute_block_size_dependent_on_layout: if True, the block_size is permuted such that the fastest coordinate
gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the gpu variable 'blockDim' is used
maximum_block_size: maximum block size that is possible for the GPU. Set to 'auto' to let cupy define the
maximum block size from the device properties
device_number: device number of the used GPU. By default, the zeroth device is used.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple[int],
block_size=(128, 2, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False,
maximum_block_size=(1024, 1024, 64), device_number=None):
super(BlockIndexing, self).__init__(iteration_space, data_layout)
if self._dim > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
if permute_block_size_dependent_on_layout and self._dim < 4:
block_size = self.permute_block_size_according_to_layout(block_size, data_layout)
self._block_size = block_size
if maximum_block_size == 'auto':
assert device_number is not None, 'If "maximum_block_size" is set to "auto" a device number must be stated'
# Get device limits
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
maximum_block_size = tuple(cp.cuda.runtime.deviceGetAttribute(i, device_number) for i in range(26, 29))
else:
da = cp.cuda.Device(device_number).attributes
maximum_block_size = tuple(da[f"MaxBlockDim{c}"] for c in ["X", "Y", "Z"])
self._maximum_block_size = maximum_block_size
self._compile_time_block_size = compile_time_block_size
self._device_number = device_number
@property
def cuda_indices(self):
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
indices = [block_index * bs + thread_idx
for block_index, bs, thread_idx in zip(BLOCK_IDX, block_size, THREAD_IDX)]
return indices[:self._dim]
@property
def coordinates(self):
if self._dim < 4:
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space)]
return coordinates[:self._dim]
else:
coordinates = list()
width = self._iteration_space[1].stop - self.iteration_space[1].start
coordinates.append(div_floor(self.cuda_indices[0], width))
coordinates.append(sp.Mod(self.cuda_indices[0], width))
coordinates.append(self.cuda_indices[1] + self.iteration_space[2].start)
coordinates.append(self.cuda_indices[2] + self.iteration_space[3].start)
return coordinates
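# Note on the 4D branch above (illustrative): the first two iteration-space coordinates are folded into
# the single x thread index. For width = 10, a linear x index of 23 yields
# coordinate_0 = 23 // 10 = 2 and coordinate_1 = 23 % 10 = 3.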
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
if len(widths) > 3:
widths = [widths[0] * widths[1], widths[2], widths[3]]
extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs
if not self._compile_time_block_size:
assert len(block_size) == 3
adapted_block_size = []
for i in range(len(widths)):
factor = div_floor(prod(block_size[:i]), prod(adapted_block_size))
adapted_block_size.append(sp.Min(block_size[i] * factor, widths[i]))
extend_adapted_bs = (1,) * (3 - len(adapted_block_size))
block_size = tuple(adapted_block_size) + extend_adapted_bs
block_size = tuple(sp.Min(bs, max_bs) for bs, max_bs in zip(block_size, self._maximum_block_size))
grid = tuple(div_ceil(length, block_size) for length, block_size in zip(widths, block_size))
extend_gr = (1,) * (3 - len(grid))
return {'block': block_size,
'grid': grid + extend_gr}
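# Worked example (illustrative): for a 3D iteration space with widths (32, 32, 32) and a block_size of
# (128, 2, 1), the adaptation above redistributes the unused x threads to y and yields
# {'block': (32, 8, 1), 'grid': (1, 4, 32)}.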
def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim]
if len(self._iteration_space) - 1 == len(arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space[1:], arr_shape)
numeric_iteration_slice = [self.iteration_space[0]] + numeric_iteration_slice
else:
assert len(self._iteration_space) == len(arr_shape), "Iteration space must be equal to the array shape"
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
end = [s.stop if s.stop != 0 else 1 for s in numeric_iteration_slice]
for i, s in enumerate(numeric_iteration_slice):
if s.step and s.step != 1:
end[i] = div_ceil(s.stop - s.start, s.step) + s.start
if self._dim < 4:
conditions = [c < e for c, e in zip(self.coordinates, end)]
else:
end = [end[0] * end[1], end[2], end[3]]
coordinates = [c + iter_slice.start for c, iter_slice in zip(self.cuda_indices, self._iteration_space[1:])]
conditions = [c < e for c, e in zip(coordinates, end)]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)])
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
def limit_block_size_by_register_restriction(self, block_size, required_registers_per_thread):
"""Shrinks the block_size if there are too many registers used per block.
This is not done automatically, since required_registers_per_thread is not known before compilation.
They can be obtained by ``func.num_regs`` from a cupy function.
Args:
block_size: used block size that is target for limiting
required_registers_per_thread: needed registers per thread
Returns: a smaller block_size if too many registers are used.
"""
import cupy as cp
# See https://github.com/cupy/cupy/issues/7676
if cp.cuda.runtime.is_hip:
max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, self._device_number)
else:
device = cp.cuda.Device(self._device_number)
da = device.attributes
max_registers_per_block = da.get("MaxRegistersPerBlock")
result = list(block_size)
while True:
required_registers = math.prod(result) * required_registers_per_thread
if required_registers <= max_registers_per_block:
return result
else:
largest_list_entry_idx = max(range(len(result)), key=lambda e: result[e])
assert result[largest_list_entry_idx] >= 2
result[largest_list_entry_idx] //= 2
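# Worked example (illustrative, assuming a typical limit of 65536 registers per block): a block size of
# (128, 2, 1) with 512 required registers per thread needs 256 * 512 = 131072 registers, so the largest
# dimension is halved once, giving (64, 2, 1) with 128 * 512 = 65536 registers.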
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
"""Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
if not is_integer_sequence(block_size):
return block_size
sorted_block_size = list(sorted(block_size, reverse=True))
while len(sorted_block_size) > len(layout):
sorted_block_size[0] *= sorted_block_size[-1]
sorted_block_size = sorted_block_size[:-1]
result = list(block_size)
for l, bs in zip(reversed(layout), sorted_block_size):
result[l] = bs
return tuple(result[:len(layout)])
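# Example (illustrative): block_size = (128, 2, 1) with layout = (0, 1, 2), i.e. the last coordinate is
# the fastest, is permuted to (1, 2, 128), so the fastest coordinate receives the largest block dimension.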
def max_threads_per_block(self):
if is_integer_sequence(self._block_size):
return prod(self._block_size)
else:
return None
def symbolic_parameters(self):
return set(b for b in self._block_size if isinstance(b, sp.Symbol))
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line', i.e. the elements which are adjacent in memory, to a 1D GPU block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}.
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimension must not be larger than the
maximum number of threads allowed in a GPU block (which depends on the device).
Args:
iteration_space: list of slices to determine start, stop and the step size for each coordinate
data_layout: tuple to determine the fast and slow coordinates.
"""
def __init__(self, iteration_space: Tuple[slice], data_layout: Tuple):
super(LineIndexing, self).__init__(iteration_space, data_layout)
if len(iteration_space) > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
@property
def cuda_indices(self):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
coordinates = available_indices[:self.dim]
fastest_coordinate = self.data_layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
return coordinates
@property
def coordinates(self):
return [i + o.start for i, o in zip(self.cuda_indices, self._iteration_space)]
def get_loop_ctr_assignments(self, loop_counter_symbols):
return _loop_ctr_assignments(loop_counter_symbols, self.coordinates, self._iteration_space)
def call_parameters(self, arr_shape):
numeric_iteration_slice = _get_numeric_iteration_slice(self._iteration_space, arr_shape)
widths = _get_widths(numeric_iteration_slice)
def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self.cuda_indices:
return 1
else:
idx = self.cuda_indices.index(cuda_idx)
return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
def guard(self, kernel_content, arr_shape):
return kernel_content
def max_threads_per_block(self):
return None
def symbolic_parameters(self):
return set()
def numeric_iteration_space(self, arr_shape):
return _get_numeric_iteration_slice(self._iteration_space, arr_shape)
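# Example (illustrative): for a 3D iteration space with data_layout (0, 1, 2), LineIndexing maps the
# fastest coordinate (index 2) to threadIdx.x and the remaining coordinates to blockIdx.y and blockIdx.x,
# so each GPU block processes one memory-contiguous line.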
# -------------------------------------- Helper functions --------------------------------------------------------------
def _get_numeric_iteration_slice(iteration_slice, arr_shape):
res = []
for slice_component, shape in zip(iteration_slice, arr_shape):
result_slice = slice_component
if not isinstance(result_slice.start, int):
start = result_slice.start
assert len(start.free_symbols) == 1
start = start.subs({symbol: shape for symbol in start.free_symbols})
result_slice = slice(start, result_slice.stop, result_slice.step)
if not isinstance(result_slice.stop, int):
stop = result_slice.stop
assert len(stop.free_symbols) == 1
stop = stop.subs({symbol: shape for symbol in stop.free_symbols})
result_slice = slice(result_slice.start, stop, result_slice.step)
assert isinstance(result_slice.step, int)
res.append(result_slice)
return res
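# Example (illustrative): for a symbolic slice such as slice(1, _size_arr_0 - 1, 1) and an array shape of
# 32 in that coordinate, the shape symbol is substituted and the numeric slice becomes slice(1, 31, 1).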
def _get_widths(iteration_slice):
widths = []
for iter_slice in iteration_slice:
step = iter_slice.step
assert isinstance(step, int), f"Step can only be of type int, not of type {type(step)}"
start = iter_slice.start
stop = iter_slice.stop
if step == 1:
if stop - start == 0:
widths.append(1)
else:
widths.append(stop - start)
else:
width = (stop - start) / step
if isinstance(width, int):
widths.append(width)
elif isinstance(width, float):
widths.append(math.ceil(width))
else:
widths.append(div_ceil(stop - start, step))
return widths
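# Example (illustrative): slice(1, 31, 1) has width 30, while slice(1, 31, 2) has width
# ceil((31 - 1) / 2) = 15.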
def _loop_ctr_assignments(loop_counter_symbols, coordinates, iteration_space):
loop_ctr_assignments = []
for loop_counter, coordinate, iter_slice in zip(loop_counter_symbols, coordinates, iteration_space):
if isinstance(iter_slice, slice) and iter_slice.step > 1:
offset = (iter_slice.step * iter_slice.start) - iter_slice.start
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate * iter_slice.step - offset))
elif iter_slice.start == iter_slice.stop:
loop_ctr_assignments.append(SympyAssignment(loop_counter, 0))
else:
loop_ctr_assignments.append(SympyAssignment(loop_counter, coordinate))
return loop_ctr_assignments
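# Example (illustrative): for iter_slice = slice(1, 31, 2) the offset is 2 * 1 - 1 = 1, so a thread
# coordinate c (which already contains the start offset) is mapped to the loop counter 2 * c - 1,
# producing the cells 1, 3, 5, ...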
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
if isinstance(gpu_indexing, str):
if gpu_indexing == 'block':
indexing_creator = BlockIndexing
elif gpu_indexing == 'line':
indexing_creator = LineIndexing
else:
raise ValueError(f"Unknown GPU indexing {gpu_indexing}. Valid values are 'block' and 'line'")
if gpu_indexing_params:
indexing_creator = partial(indexing_creator, **gpu_indexing_params)
return indexing_creator
else:
return gpu_indexing
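# Example (illustrative usage):
#     creator = indexing_creator_from_params('block', {'block_size': (64, 4, 1)})
#     # creator is functools.partial(BlockIndexing, block_size=(64, 4, 1)) and is called with an
#     # iteration_space and data_layout, as in the classes above.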
-from functools import partial
+import sympy as sp

-from pystencils.gpucuda.indexing import BlockIndexing
-from pystencils.transformations import resolve_field_accesses, add_types, parse_base_pointer_info, \
-    get_common_shape, resolve_buffer_accesses, unify_shape_symbols, get_base_buffer_index
-from pystencils.astnodes import Block, KernelFunction, SympyAssignment, LoopOverCoordinate
-from pystencils.data_types import TypedSymbol, BasicType, StructType
-from pystencils import Field, FieldType
-from pystencils.gpucuda.cudajit import make_python_function
+from pystencils.astnodes import Block, KernelFunction, LoopOverCoordinate, SympyAssignment
+from pystencils.config import CreateKernelConfig
+from pystencils.typing import StructType, TypedSymbol
+from pystencils.typing.transformations import add_types
+from pystencils.field import Field, FieldType
+from pystencils.enums import Target, Backend
+from pystencils.gpu.gpujit import make_python_function
+from pystencils.node_collection import NodeCollection
+from pystencils.gpu.indexing import indexing_creator_from_params
+from pystencils.slicing import normalize_slice
+from pystencils.transformations import (
+    get_base_buffer_index, get_common_field, get_common_indexed_element, parse_base_pointer_info,
+    resolve_buffer_accesses, resolve_field_accesses, unify_shape_symbols)


-def create_cuda_kernel(assignments, function_name="kernel", type_info=None, indexing_creator=BlockIndexing,
-                       iteration_slice=None, ghost_layers=None, skip_independence_check=False):
-    fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
+def create_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
+    function_name = config.function_name
+    indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
+    iteration_slice = config.iteration_slice
+    ghost_layers = config.ghost_layers
+
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields
+    assignments = assignments.all_assignments
+
+    assignments = add_types(assignments, config)

     all_fields = fields_read.union(fields_written)
     read_only_fields = set([f.name for f in fields_read - fields_written])

@@ -20,11 +36,16 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde

     field_accesses = set()
     num_buffer_accesses = 0
+    indexed_elements = set()
     for eq in assignments:
+        indexed_elements.update(eq.atoms(sp.Indexed))
         field_accesses.update(eq.atoms(Field.Access))
+        field_accesses = {e for e in field_accesses if not e.is_absolute_access}
         num_buffer_accesses += sum(1 for access in eq.atoms(Field.Access) if FieldType.is_buffer(access.field))

-    common_shape = get_common_shape(fields_without_buffers)
+    # common shape and field to from the iteration space
+    common_field = get_common_field(fields_without_buffers)
+    common_shape = common_field.spatial_shape

     if iteration_slice is None:
         # determine iteration slice from ghost layers
@@ -42,33 +63,51 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde
             iteration_slice.append(slice(ghost_layers[i][0],
                                          -ghost_layers[i][1] if ghost_layers[i][1] > 0 else None))

-    indexing = indexing_creator(field=list(fields_without_buffers)[0], iteration_slice=iteration_slice)
-    coord_mapping = indexing.coordinates
+        iteration_space = normalize_slice(iteration_slice, common_shape)
+    else:
+        iteration_space = normalize_slice(iteration_slice, common_shape)

-    cell_idx_assignments = [SympyAssignment(LoopOverCoordinate.get_loop_counter_symbol(i), value)
-                            for i, value in enumerate(coord_mapping)]
-    cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coord_mapping)]
-    assignments = cell_idx_assignments + assignments
+    iteration_space = tuple([s if isinstance(s, slice) else slice(s, s + 1, 1) for s in iteration_space])
+    loop_counter_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(len(iteration_space))]

+    if len(indexed_elements) > 0:
+        common_indexed_element = get_common_indexed_element(indexed_elements)
+        index = common_indexed_element.indices[0].atoms(TypedSymbol)
+        assert len(index) == 1, "index expressions must only contain one symbol representing the index"
+        indexing = indexing_creator(iteration_space=(slice(0, common_indexed_element.shape[0], 1), *iteration_space),
+                                    data_layout=common_field.layout)
+        extended_ctrs = [index.pop(), *loop_counter_symbols]
+        loop_counter_assignments = indexing.get_loop_ctr_assignments(extended_ctrs)
+    else:
+        indexing = indexing_creator(iteration_space=iteration_space, data_layout=common_field.layout)
+        loop_counter_assignments = indexing.get_loop_ctr_assignments(loop_counter_symbols)
+
+    assignments = loop_counter_assignments + assignments
-    block = indexing.guard(Block(assignments), common_shape)
+    block = Block(assignments)
+    block = indexing.guard(block, common_shape)

     unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)

-    ast = KernelFunction(block, function_name=function_name, ghost_layers=ghost_layers, backend='gpucuda')
+    ast = KernelFunction(block,
+                         Target.GPU,
+                         Backend.CUDA,
+                         make_python_function,
+                         ghost_layers,
+                         function_name,
+                         assignments=assignments)
     ast.global_variables.update(indexing.index_variables)

-    base_pointer_spec = [['spatialInner0']]
+    base_pointer_spec = config.base_pointer_specification
+    if base_pointer_spec is None:
+        base_pointer_spec = []
     base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
                                                          f.spatial_dimensions, f.index_dimensions)
                          for f in all_fields}

-    coord_mapping = {f.name: cell_idx_symbols for f in all_fields}
+    coord_mapping = {f.name: loop_counter_symbols for f in all_fields}
-    loop_strides = list(fields_without_buffers)[0].shape

     if any(FieldType.is_buffer(f) for f in all_fields):
-        resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields)
+        base_buffer_index = get_base_buffer_index(ast, loop_counter_symbols, iteration_space)
+        resolve_buffer_accesses(ast, base_buffer_index, read_only_fields)

     resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                            field_to_fixed_coordinates=coord_mapping)

@@ -84,47 +123,63 @@ def create_cuda_kernel(assignments, function_name="kernel", type_info=None, inde
             ast.body.insert_front(SympyAssignment(loop_counter, indexing.coordinates[i]))

     ast.indexing = indexing
-    ast.compile = partial(make_python_function, ast)
     return ast
-def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel", type_info=None,
-                                coordinate_names=('x', 'y', 'z'), indexing_creator=BlockIndexing):
-    fields_read, fields_written, assignments = add_types(assignments, type_info, check_independence_condition=False)
+def created_indexed_cuda_kernel(assignments: NodeCollection, config: CreateKernelConfig):
+    index_fields = config.index_fields
+    function_name = config.function_name
+    coordinate_names = config.coordinate_names
+    indexing_creator = indexing_creator_from_params(config.gpu_indexing, config.gpu_indexing_params)
+
+    fields_written = assignments.bound_fields
+    fields_read = assignments.rhs_fields

     all_fields = fields_read.union(fields_written)
     read_only_fields = set([f.name for f in fields_read - fields_written])

+    # extract the index fields based on the name. The original index field might have been modified
+    index_fields = [idx_field for idx_field in index_fields if idx_field.name in [f.name for f in all_fields]]
+    non_index_fields = [f for f in all_fields if f not in index_fields]
+    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
+    assert len(spatial_coordinates) == 1, f"Non-index fields do not have the same number of spatial coordinates " \
+                                          f"Non index fields are {non_index_fields}, spatial coordinates are " \
+                                          f"{spatial_coordinates}"
+    spatial_coordinates = list(spatial_coordinates)[0]
+
+    assignments = assignments.all_assignments
+    assignments = add_types(assignments, config)

     for index_field in index_fields:
         index_field.field_type = FieldType.INDEXED
         assert FieldType.is_indexed(index_field)
         assert index_field.spatial_dimensions == 1, "Index fields have to be 1D"

-    non_index_fields = [f for f in all_fields if f not in index_fields]
-    spatial_coordinates = {f.spatial_dimensions for f in non_index_fields}
-    assert len(spatial_coordinates) == 1, "Non-index fields do not have the same number of spatial coordinates"
-    spatial_coordinates = list(spatial_coordinates)[0]

     def get_coordinate_symbol_assignment(name):
         for ind_f in index_fields:
             assert isinstance(ind_f.dtype, StructType), "Index fields have to have a struct data type"
             data_type = ind_f.dtype
             if data_type.has_element(name):
                 rhs = ind_f[0](name)
-                lhs = TypedSymbol(name, BasicType(data_type.get_element_type(name)))
+                lhs = TypedSymbol(name, data_type.get_element_type(name))
                 return SympyAssignment(lhs, rhs)
-        raise ValueError("Index %s not found in any of the passed index fields" % (name,))
+        raise ValueError(f"Index {name} not found in any of the passed index fields")

     coordinate_symbol_assignments = [get_coordinate_symbol_assignment(n)
                                      for n in coordinate_names[:spatial_coordinates]]
     coordinate_typed_symbols = [eq.lhs for eq in coordinate_symbol_assignments]

     idx_field = list(index_fields)[0]
-    indexing = indexing_creator(field=idx_field,
-                                iteration_slice=[slice(None, None, None)] * len(idx_field.spatial_shape))
+
+    iteration_space = normalize_slice(tuple([slice(None, None, None)]) * len(idx_field.spatial_shape),
+                                      idx_field.spatial_shape)
+
+    indexing = indexing_creator(iteration_space=iteration_space,
+                                data_layout=idx_field.layout)

     function_body = Block(coordinate_symbol_assignments + assignments)
-    function_body = indexing.guard(function_body, get_common_shape(index_fields))
+    function_body = indexing.guard(function_body, get_common_field(index_fields).spatial_shape)
-    ast = KernelFunction(function_body, function_name=function_name, backend='gpucuda')
+    ast = KernelFunction(function_body, Target.GPU, Backend.CUDA, make_python_function,
+                         None, function_name, assignments=assignments)
     ast.global_variables.update(indexing.index_variables)

     coord_mapping = indexing.coordinates
@@ -141,5 +196,4 @@ def created_indexed_cuda_kernel(assignments, index_fields, function_name="kernel
     # add the function which determines #blocks and #threads as additional member to KernelFunction node
     # this is used by the jit
     ast.indexing = indexing
-    ast.compile = partial(make_python_function, ast)
     return ast
 import numpy as np
-from pystencils import Field, Assignment
-from pystencils.slicing import normalize_slice, get_periodic_boundary_src_dst_slices
-from pystencils.gpucuda import make_python_function
-from pystencils.gpucuda.kernelcreation import create_cuda_kernel
+from itertools import product
+
+from pystencils import CreateKernelConfig, create_kernel
+from pystencils.gpu import make_python_function
+from pystencils import Assignment, Field
+from pystencils.enums import Target
+from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice


 def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
     """Copies a rectangular part of an array to another non-overlapping part"""
-    if index_dimensions not in (0, 1):
-        raise NotImplementedError("Works only for one or zero index coordinates")

     f = Field.create_generic("pdfs", len(domain_size), index_dimensions=index_dimensions, dtype=dtype)
     normalized_from_slice = normalize_slice(from_slice, f.spatial_shape)
@@ -19,21 +20,27 @@ def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, in
         "Slices have to have same size"

     update_eqs = []
-    for i in range(index_dim_shape):
-        eq = Assignment(f(i), f[tuple(offset)](i))
+    if index_dimensions < 2:
+        index_dim_shape = [index_dim_shape]
+    for i in product(*[range(d) for d in index_dim_shape]):
+        eq = Assignment(f(*i), f[tuple(offset)](*i))
         update_eqs.append(eq)

-    ast = create_cuda_kernel(update_eqs, iteration_slice=to_slice, skip_independence_check=True)
-    return make_python_function(ast)
+    config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
+
+    ast = create_kernel(update_eqs, config=config)
+    return ast


 def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=float):
+                                  thickness=None, dtype=np.float64, target=Target.GPU):
+    assert target in {Target.GPU}
     src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
     kernels = []
+    index_dimensions = index_dimensions
     for src_slice, dst_slice in src_dst_slice_tuples:
-        kernels.append(create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype))
+        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
+        kernels.append(make_python_function(ast))

     def functor(pdfs, **_):
         for kernel in kernels:
...
-import os
+from os.path import dirname, realpath


 def get_pystencils_include_path():
-    return os.path.dirname(os.path.realpath(__file__))
+    return dirname(realpath(__file__))
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <emmintrin.h> // SSE2
#include <wmmintrin.h> // AES
#ifdef __AVX__
#include <immintrin.h> // AVX*
#else
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#include <cstdint>
#include <array>
#include <map>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#include "myintrin.h"
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
template <typename T, std::size_t Alignment>
class AlignedAllocator
{
public:
typedef T value_type;
template <typename U>
struct rebind {
typedef AlignedAllocator<U, Alignment> other;
};
T * allocate(const std::size_t n) const {
if (n == 0) {
return nullptr;
}
void * const p = _mm_malloc(n*sizeof(T), Alignment);
if (p == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(p);
}
void deallocate(T * const p, const std::size_t n) const {
_mm_free(p);
}
};
template <typename Key, typename T>
using AlignedMap = std::map<Key, T, std::less<Key>, AlignedAllocator<std::pair<const Key, T>, sizeof(Key)>>;
#if defined(__AES__) || defined(_MSC_VER)
QUALIFIERS __m128i aesni_keygen_assist(__m128i temp1, __m128i temp2) {
__m128i temp3;
temp2 = _mm_shuffle_epi32(temp2 ,0xff);
temp3 = _mm_slli_si128(temp1, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp3 = _mm_slli_si128(temp3, 0x4);
temp1 = _mm_xor_si128(temp1, temp3);
temp1 = _mm_xor_si128(temp1, temp2);
return temp1;
}
QUALIFIERS std::array<__m128i,11> aesni_keygen(__m128i k) {
std::array<__m128i,11> rk;
__m128i tmp;
rk[0] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1);
k = aesni_keygen_assist(k, tmp);
rk[1] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x2);
k = aesni_keygen_assist(k, tmp);
rk[2] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x4);
k = aesni_keygen_assist(k, tmp);
rk[3] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x8);
k = aesni_keygen_assist(k, tmp);
rk[4] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x10);
k = aesni_keygen_assist(k, tmp);
rk[5] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x20);
k = aesni_keygen_assist(k, tmp);
rk[6] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x40);
k = aesni_keygen_assist(k, tmp);
rk[7] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x80);
k = aesni_keygen_assist(k, tmp);
rk[8] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x1b);
k = aesni_keygen_assist(k, tmp);
rk[9] = k;
tmp = _mm_aeskeygenassist_si128(k, 0x36);
k = aesni_keygen_assist(k, tmp);
rk[10] = k;
return rk;
}
QUALIFIERS const std::array<__m128i,11> & aesni_roundkeys(const __m128i & k128) {
alignas(16) std::array<uint32,4> a;
_mm_store_si128((__m128i*) a.data(), k128);
static AlignedMap<std::array<uint32,4>, std::array<__m128i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk = aesni_keygen(k128);
roundkeys[a] = rk;
}
return roundkeys[a];
}
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k0) {
auto k = aesni_roundkeys(k0);
__m128i x = _mm_xor_si128(k[0], in);
x = _mm_aesenc_si128(x, k[1]);
x = _mm_aesenc_si128(x, k[2]);
x = _mm_aesenc_si128(x, k[3]);
x = _mm_aesenc_si128(x, k[4]);
x = _mm_aesenc_si128(x, k[5]);
x = _mm_aesenc_si128(x, k[6]);
x = _mm_aesenc_si128(x, k[7]);
x = _mm_aesenc_si128(x, k[8]);
x = _mm_aesenc_si128(x, k[9]);
x = _mm_aesenclast_si128(x, k[10]);
return x;
}
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
double & rnd1, double & rnd2)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert 32 to 64 bit and put 0th and 2nd element into x, 1st and 3rd element into y
__m128i x = _mm_and_si128(c128, _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff));
__m128i y = _mm_and_si128(c128, _mm_set_epi32(0xffffffff, 0, 0xffffffff, 0));
y = _mm_srli_si128(y, 4);
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
// store result
alignas(16) double rr[2];
_mm_store_pd(rr, rs);
rnd1 = rr[0];
rnd2 = rr[1];
}
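// Example (illustrative): draw two uniform doubles in (0, 1) from a 128-bit counter and a 128-bit key,
// each given as four 32-bit words; the concrete counter and key values below are arbitrary.
//   double r1, r2;
//   aesni_double2(0u, 1u, 2u, 3u, 0xc0ffee11u, 0u, 0u, 0u, r1, r2);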
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
// pack input and call AES
__m128i c128 = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
c128 = aesni1xm128i(c128, k128);
// convert uint32 to float
__m128 rs = _my_cvtepu32_ps(c128);
// calculate rs * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rs = _mm_fmadd_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#else
rs = _mm_mul_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rs = _mm_add_ps(rs, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
// store result
alignas(16) float r[4];
_mm_store_ps(r, rs);
rnd1 = r[0];
rnd2 = r[1];
rnd3 = r[2];
rnd4 = r[3];
}
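// The helper below builds a 53-bit random integer from two 32-bit inputs (y is shifted left by
// 53 - 32 = 21 bits and XORed with x) and scales it by 2^-53, plus half that value as an offset,
// to obtain uniform doubles in (0, 1); the template parameter selects the high or low 32-bit halves
// of the input vectors.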
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
// pack input and call AES
__m128i k128 = _mm_set_epi32(key3, key2, key1, key0);
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k128);
}
_MY_TRANSPOSE4_EPI32(ctr[0], ctr[1], ctr[2], ctr[3]);
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __AVX2__
QUALIFIERS const std::array<__m256i,11> & aesni_roundkeys(const __m256i & k256) {
alignas(32) std::array<uint32,8> a;
_mm256_store_si256((__m256i*) a.data(), k256);
static AlignedMap<std::array<uint32,8>, std::array<__m256i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm256_extractf128_si256(k256, 0));
auto rk2 = aesni_keygen(_mm256_extractf128_si256(k256, 1));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my256_set_m128i(rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m256i aesni1xm128i(const __m256i & in, const __m256i & k0) {
#if defined(__VAES__)
auto k = aesni_roundkeys(k0);
__m256i x = _mm256_xor_si256(k[0], in);
x = _mm256_aesenc_epi128(x, k[1]);
x = _mm256_aesenc_epi128(x, k[2]);
x = _mm256_aesenc_epi128(x, k[3]);
x = _mm256_aesenc_epi128(x, k[4]);
x = _mm256_aesenc_epi128(x, k[5]);
x = _mm256_aesenc_epi128(x, k[6]);
x = _mm256_aesenc_epi128(x, k[7]);
x = _mm256_aesenc_epi128(x, k[8]);
x = _mm256_aesenc_epi128(x, k[9]);
x = _mm256_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm256_extractf128_si256(in, 0), _mm256_extractf128_si256(k0, 0));
__m128i b = aesni1xm128i(_mm256_extractf128_si256(in, 1), _mm256_extractf128_si256(k0, 1));
__m256i x = _my256_set_m128i(b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void aesni_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void aesni_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
// pack input and call AES
__m256i k256 = _mm256_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0);
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k256);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm256_extractf128_si256(ctr[i], 0);
b[i] = _mm256_extractf128_si256(ctr[i], 1);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my256_set_m128i(b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS const std::array<__m512i,11> & aesni_roundkeys(const __m512i & k512) {
alignas(64) std::array<uint32,16> a;
_mm512_store_si512((__m512i*) a.data(), k512);
static AlignedMap<std::array<uint32,16>, std::array<__m512i,11>> roundkeys;
if(roundkeys.find(a) == roundkeys.end()) {
auto rk1 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 0));
auto rk2 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 1));
auto rk3 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 2));
auto rk4 = aesni_keygen(_mm512_extracti32x4_epi32(k512, 3));
for(int i = 0; i < 11; ++i) {
roundkeys[a][i] = _my512_set_m128i(rk4[i], rk3[i], rk2[i], rk1[i]);
}
}
return roundkeys[a];
}
QUALIFIERS __m512i aesni1xm128i(const __m512i & in, const __m512i & k0) {
#ifdef __VAES__
auto k = aesni_roundkeys(k0);
__m512i x = _mm512_xor_si512(k[0], in);
x = _mm512_aesenc_epi128(x, k[1]);
x = _mm512_aesenc_epi128(x, k[2]);
x = _mm512_aesenc_epi128(x, k[3]);
x = _mm512_aesenc_epi128(x, k[4]);
x = _mm512_aesenc_epi128(x, k[5]);
x = _mm512_aesenc_epi128(x, k[6]);
x = _mm512_aesenc_epi128(x, k[7]);
x = _mm512_aesenc_epi128(x, k[8]);
x = _mm512_aesenc_epi128(x, k[9]);
x = _mm512_aesenclast_epi128(x, k[10]);
#else
__m128i a = aesni1xm128i(_mm512_extracti32x4_epi32(in, 0), _mm512_extracti32x4_epi32(k0, 0));
__m128i b = aesni1xm128i(_mm512_extracti32x4_epi32(in, 1), _mm512_extracti32x4_epi32(k0, 1));
__m128i c = aesni1xm128i(_mm512_extracti32x4_epi32(in, 2), _mm512_extracti32x4_epi32(k0, 2));
__m128i d = aesni1xm128i(_mm512_extracti32x4_epi32(in, 3), _mm512_extracti32x4_epi32(k0, 3));
__m512i x = _my512_set_m128i(d, c, b, a);
#endif
return x;
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void aesni_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
}
QUALIFIERS void aesni_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
// pack input and call AES
__m512i k512 = _mm512_set_epi32(key3, key2, key1, key0, key3, key2, key1, key0,
key3, key2, key1, key0, key3, key2, key1, key0);
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
__m128i a[4], b[4], c[4], d[4];
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
for (int i = 0; i < 4; ++i)
{
ctr[i] = aesni1xm128i(ctr[i], k512);
}
for (int i = 0; i < 4; ++i)
{
a[i] = _mm512_extracti32x4_epi32(ctr[i], 0);
b[i] = _mm512_extracti32x4_epi32(ctr[i], 1);
c[i] = _mm512_extracti32x4_epi32(ctr[i], 2);
d[i] = _mm512_extracti32x4_epi32(ctr[i], 3);
}
_MY_TRANSPOSE4_EPI32(a[0], a[1], a[2], a[3]);
_MY_TRANSPOSE4_EPI32(b[0], b[1], b[2], b[3]);
_MY_TRANSPOSE4_EPI32(c[0], c[1], c[2], c[3]);
_MY_TRANSPOSE4_EPI32(d[0], d[1], d[2], d[3]);
for (int i = 0; i < 4; ++i)
{
ctr[i] = _my512_set_m128i(d[i], c[i], b[i], a[i]);
}
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void aesni_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void aesni_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1, uint32 key2, uint32 key3,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
aesni_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, key2, key3, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
aesni_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, key2, key3, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
/*
Copyright 2021-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#include <arm_sve.h>
typedef svbool_t svbool_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svint32_t svint32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#endif
#ifdef __ARM_NEON
inline float32x4_t makeVec_f32(float a, float b, float c, float d)
{
alignas(16) float data[4] = {a, b, c, d};
return vld1q_f32(data);
}
inline float64x2_t makeVec_f64(double a, double b)
{
alignas(16) double data[2] = {a, b};
return vld1q_f64(data);
}
inline int32x4_t makeVec_s32(int a, int b, int c, int d)
{
alignas(16) int data[4] = {a, b, c, d};
return vld1q_s32(data);
}
#endif
inline void cachelineZero(void * p) {
#if !defined(_MSC_VER) || defined(__clang__)
__asm__ volatile("dc zva, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
#if !defined(_MSC_VER) || defined(__clang__)
// check that dc zva is permitted
uint64_t dczid;
__asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
if ((dczid & (1 << 4)) != 0) {
return SIZE_MAX;
}
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
#endif
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
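// Usage sketch (illustrative only; zeroBuffer is a hypothetical helper, not part of this
// header's API): once cachelineSize() reports a usable size, aligned whole cachelines can be
// zeroed via "dc zva"; anything else falls back to plain stores.
inline void zeroBuffer(uint8_t * buf, size_t bytes)
{
    const size_t cl = cachelineSize();
    if (cl == SIZE_MAX || ((uintptr_t) buf) % cl != 0 || bytes % cl != 0)
    {
        // unknown cacheline size, unaligned buffer, or partial trailing cacheline
        for (size_t i = 0; i < bytes; ++i)
            buf[i] = 0;
        return;
    }
    for (size_t i = 0; i < bytes; i += cl)
        cachelineZero((void*) (buf + i));
}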
/*
Copyright 2023, Markus Holzer.
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#ifdef __HIPCC_RTC__
typedef __hip_uint8_t uint8_t;
typedef __hip_int8_t int8_t;
typedef __hip_uint16_t uint16_t;
typedef __hip_int16_t int16_t;
#endif
/*
Copyright 2023, Markus Holzer.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/// Half precision support. Experimental. Use carefully.
///
/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
/// On x86 architectures, the data format is typically supported natively only for storage and interchange;
/// arithmetic operations usually involve a cast to fp32 (C++ float) followed by truncation back to fp16.
/// Only bandwidth-bound code is therefore likely to benefit. None of this is guaranteed and may change in the future.
/// On x86, half precision support requires Clang 15 or higher, or GCC 12 or higher.
/// Support also appears to require SSE, so make sure the respective instruction sets are enabled.
/// See
/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
/// for more information.
#pragma once
using half = _Float16;
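// Minimal usage sketch (illustrative only; half_axpy is a hypothetical helper, not part of this
// header): on x86 the arithmetic typically round-trips through float, so the casts below make
// the expected fp32 computation and fp16 truncation explicit.
inline half half_axpy(half a, half x, half y)
{
    return static_cast<half>(static_cast<float>(a) * static_cast<float>(x) + static_cast<float>(y));
}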
/*
Copyright 2019-2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS __m128 _my_cvtepu32_ps(const __m128i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu32_ps(v);
#else
__m128i v2 = _mm_srli_epi32(v, 1);
__m128i v1 = _mm_and_si128(v, _mm_set1_epi32(1));
__m128 v2f = _mm_cvtepi32_ps(v2);
__m128 v1f = _mm_cvtepi32_ps(v1);
return _mm_add_ps(_mm_add_ps(v2f, v2f), v1f);
#endif
}
QUALIFIERS void _MY_TRANSPOSE4_EPI32(__m128i & R0, __m128i & R1, __m128i & R2, __m128i & R3)
{
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi32(R0, R1);
T1 = _mm_unpacklo_epi32(R2, R3);
T2 = _mm_unpackhi_epi32(R0, R1);
T3 = _mm_unpackhi_epi32(R2, R3);
R0 = _mm_unpacklo_epi64(T0, T1);
R1 = _mm_unpackhi_epi64(T0, T1);
R2 = _mm_unpacklo_epi64(T2, T3);
R3 = _mm_unpackhi_epi64(T2, T3);
}
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m128d _my_cvtepu64_pd(const __m128i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(16)))) x, __m128d);
#else
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
#endif
}
#endif
#ifdef __AVX2__
QUALIFIERS __m256i _my256_set_m128i(__m128i hi, __m128i lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128i(hi, lo);
#else
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
#endif
}
QUALIFIERS __m256d _my256_set_m128d(__m128d hi, __m128d lo)
{
#if (!defined(__GNUC__) || __GNUC__ >= 8) || defined(__clang__)
return _mm256_set_m128d(hi, lo);
#else
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1);
#endif
}
QUALIFIERS __m256 _my256_cvtepu32_ps(const __m256i v)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu32_ps(v);
#else
__m256i v2 = _mm256_srli_epi32(v, 1);
__m256i v1 = _mm256_and_si256(v, _mm256_set1_epi32(1));
__m256 v2f = _mm256_cvtepi32_ps(v2);
__m256 v1f = _mm256_cvtepi32_ps(v1);
return _mm256_add_ps(_mm256_add_ps(v2f, v2f), v1f);
#endif
}
#if !defined(__AVX512VL__) && !defined(__AVX10_1__) && defined(__GNUC__) && __GNUC__ >= 5 && !defined(__clang__)
__attribute__((optimize("no-associative-math")))
#endif
QUALIFIERS __m256d _my256_cvtepu64_pd(const __m256i x)
{
#if defined(__AVX512VL__) || defined(__AVX10_1__)
return _mm256_cvtepu64_pd(x);
#elif defined(__clang__)
return __builtin_convertvector((uint64_t __attribute__((__vector_size__(32)))) x, __m256d);
#else
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i xL = _mm256_blend_epi16(x, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS __m512i _my512_set_m128i(__m128i d, __m128i c, __m128i b, __m128i a)
{
return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a), b, 1), c, 2), d, 3);
}
QUALIFIERS __m512d _my512_set_m256d(__m256d b, __m256d a)
{
return _mm512_insertf64x4(_mm512_castpd256_pd512(a), b, 1);
}
#endif
/*
Copyright 2010-2011, D. E. Shaw Research. All rights reserved.
Copyright 2019-2024, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(__OPENCL_VERSION__) && !defined(__HIPCC_RTC__)
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <emmintrin.h> // SSE2
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX*
#elif defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
#include <smmintrin.h> // SSE4
#ifdef __FMA__
#include <immintrin.h> // FMA
#endif
#endif
#if defined(_MSC_VER) && defined(_M_ARM64)
#define __ARM_NEON
#endif
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
#include <arm_sve.h>
#endif
#if defined(__powerpc__) && defined(__GNUC__) && !defined(__clang__) && !defined(__xlC__)
#include <ppu_intrinsics.h>
#endif
#ifdef __ALTIVEC__
#include <altivec.h>
#undef bool
#ifndef _ARCH_PWR8
#include <pveclib/vec_int64_ppc.h>
#endif
#endif
#ifdef __riscv_v
#include <riscv_vector.h>
#endif
#endif
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#define QUALIFIERS static __forceinline__ __device__
#elif defined(__OPENCL_VERSION__)
#define QUALIFIERS static inline
#else
#define QUALIFIERS inline
#include "myintrin.h"
#endif
#if defined(__ARM_FEATURE_SME)
#define SVE_QUALIFIERS __attribute__((arm_streaming_compatible)) QUALIFIERS
#else
#define SVE_QUALIFIERS QUALIFIERS
#endif
#define PHILOX_W32_0 (0x9E3779B9)
#define PHILOX_W32_1 (0xBB67AE85)
#define PHILOX_M4x32_0 (0xD2511F53)
#define PHILOX_M4x32_1 (0xCD9E8D57)
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
#ifdef __OPENCL_VERSION__
#include "opencl_stdint.h"
typedef uint32_t uint32;
typedef uint64_t uint64;
#else
#ifndef __HIPCC_RTC__
#include <cstdint>
#endif
typedef std::uint32_t uint32;
typedef std::uint64_t uint64;
#endif
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
typedef svfloat32_t svfloat32_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
typedef svfloat64_t svfloat64_st __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)));
#elif defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
typedef svfloat32_t svfloat32_st;
typedef svfloat64_t svfloat64_st;
#endif
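// 32x32 -> 64-bit multiply: returns the low 32 bits of a*b and stores the high 32 bits in *hip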
QUALIFIERS uint32 mulhilo32(uint32 a, uint32 b, uint32* hip)
{
#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
// host code
#if defined(__powerpc__) && (!defined(__clang__) || defined(__xlC__))
*hip = __mulhwu(a,b);
return a*b;
#elif defined(__OPENCL_VERSION__)
*hip = mul_hi(a,b);
return a*b;
#else
uint64 product = ((uint64)a) * ((uint64)b);
*hip = product >> 32;
return (uint32)product;
#endif
#else
// device code
*hip = __umulhi(a,b);
return a*b;
#endif
}
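// one Philox-4x32 round: two 32x32 -> 64-bit multiplies, then xor of the high halves with the
// remaining counter words and the round key; the low halves become the new counter words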
QUALIFIERS void _philox4x32round(uint32* ctr, uint32* key)
{
uint32 hi0;
uint32 hi1;
uint32 lo0 = mulhilo32(PHILOX_M4x32_0, ctr[0], &hi0);
uint32 lo1 = mulhilo32(PHILOX_M4x32_1, ctr[2], &hi1);
ctr[0] = hi1^ctr[1]^key[0];
ctr[1] = lo1;
ctr[2] = hi0^ctr[3]^key[1];
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32* key)
{
key[0] += PHILOX_W32_0;
key[1] += PHILOX_W32_1;
}
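// combine x and y into a 53-bit random integer and map it to a uniform double in (0,1)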
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
uint64 z = (uint64)x ^ ((uint64)y << (53 - 32));
return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
double * rnd1, double * rnd2)
#else
double & rnd1, double & rnd2)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
*rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#else
rnd1 = _uniform_double_hq(ctr[0], ctr[1]);
rnd2 = _uniform_double_hq(ctr[2], ctr[3]);
#endif
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
#ifdef __OPENCL_VERSION__
float * rnd1, float * rnd2, float * rnd3, float * rnd4)
#else
float & rnd1, float & rnd2, float & rnd3, float & rnd4)
#endif
{
uint32 key[2] = {key0, key1};
uint32 ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
#ifdef __OPENCL_VERSION__
*rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
*rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#else
rnd1 = ctr[0] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd2 = ctr[1] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd3 = ctr[2] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
rnd4 = ctr[3] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f);
#endif
}
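// Usage sketch (illustrative only; philox_rand01 and its parameter names are hypothetical):
// each distinct counter/key combination yields an independent draw of four uniform floats in
// (0,1], so the counter words can simply encode e.g. cell coordinates and the time step while
// the key carries the seed.
#ifndef __OPENCL_VERSION__
QUALIFIERS void philox_rand01(uint32 time_step, uint32 x, uint32 y, uint32 z, uint32 seed,
                              float & r1, float & r2, float & r3, float & r4)
{
    philox_float4(time_step, x, y, z, seed, 0u, r1, r2, r3, r4);
}
#endif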
#if !defined(__CUDA_ARCH__) && !defined(__OPENCL_VERSION__) && !defined(__HIP_DEVICE_COMPILE__)
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_ARM64))
QUALIFIERS void _philox4x32round(__m128i* ctr, __m128i* key)
{
__m128i lohi0a = _mm_mul_epu32(ctr[0], _mm_set1_epi32(PHILOX_M4x32_0));
__m128i lohi0b = _mm_mul_epu32(_mm_srli_epi64(ctr[0], 32), _mm_set1_epi32(PHILOX_M4x32_0));
__m128i lohi1a = _mm_mul_epu32(ctr[2], _mm_set1_epi32(PHILOX_M4x32_1));
__m128i lohi1b = _mm_mul_epu32(_mm_srli_epi64(ctr[2], 32), _mm_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm_shuffle_epi32(lohi0a, 0xD8);
lohi0b = _mm_shuffle_epi32(lohi0b, 0xD8);
lohi1a = _mm_shuffle_epi32(lohi1a, 0xD8);
lohi1b = _mm_shuffle_epi32(lohi1b, 0xD8);
__m128i lo0 = _mm_unpacklo_epi32(lohi0a, lohi0b);
__m128i hi0 = _mm_unpackhi_epi32(lohi0a, lohi0b);
__m128i lo1 = _mm_unpacklo_epi32(lohi1a, lohi1b);
__m128i hi1 = _mm_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm_xor_si128(_mm_xor_si128(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm_xor_si128(_mm_xor_si128(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m128i* key)
{
key[0] = _mm_add_epi32(key[0], _mm_set1_epi32(PHILOX_W32_0));
key[1] = _mm_add_epi32(key[1], _mm_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m128d _uniform_double_hq(__m128i x, __m128i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm_unpackhi_epi32(x, _mm_setzero_si128());
y = _mm_unpackhi_epi32(y, _mm_setzero_si128());
}
else
{
x = _mm_unpacklo_epi32(x, _mm_setzero_si128());
y = _mm_unpacklo_epi32(y, _mm_setzero_si128());
}
// calculate z = x ^ (y << (53 - 32))
__m128i z = _mm_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm_xor_si128(x, z);
// convert uint64 to double
__m128d rs = _my_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm_fmadd_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE), _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm_mul_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm_add_pd(rs, _mm_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void philox_float4(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _my_cvtepu32_ps(ctr[0]);
rnd2 = _my_cvtepu32_ps(ctr[1]);
rnd3 = _my_cvtepu32_ps(ctr[2]);
rnd4 = _my_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm_fmadd_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm_fmadd_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm_fmadd_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm_fmadd_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT), _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm_mul_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm_add_ps(rnd1, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm_mul_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm_add_ps(rnd2, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm_mul_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm_add_ps(rnd3, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm_mul_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm_add_ps(rnd4, _mm_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void philox_double2(__m128i ctr0, __m128i ctr1, __m128i ctr2, __m128i ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i key[2] = {_mm_set1_epi32(key0), _mm_set1_epi32(key1)};
__m128i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128 & rnd1, __m128 & rnd2, __m128 & rnd3, __m128 & rnd4)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1lo, __m128d & rnd1hi, __m128d & rnd2lo, __m128d & rnd2hi)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m128i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m128d & rnd1, __m128d & rnd2)
{
__m128i ctr0v = _mm_set1_epi32(ctr0);
__m128i ctr2v = _mm_set1_epi32(ctr2);
__m128i ctr3v = _mm_set1_epi32(ctr3);
__m128d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
#endif
#ifdef __ALTIVEC__
QUALIFIERS void _philox4x32round(__vector unsigned int* ctr, __vector unsigned int* key)
{
#ifndef _ARCH_PWR8
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi0 = vec_mulhuw(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulhuw(ctr[2], vec_splats(PHILOX_M4x32_1));
#elif defined(_ARCH_PWR10)
__vector unsigned int lo0 = vec_mul(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int hi0 = vec_mulh(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lo1 = vec_mul(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int hi1 = vec_mulh(ctr[2], vec_splats(PHILOX_M4x32_1));
#else
__vector unsigned int lohi0a = (__vector unsigned int) vec_mule(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi0b = (__vector unsigned int) vec_mulo(ctr[0], vec_splats(PHILOX_M4x32_0));
__vector unsigned int lohi1a = (__vector unsigned int) vec_mule(ctr[2], vec_splats(PHILOX_M4x32_1));
__vector unsigned int lohi1b = (__vector unsigned int) vec_mulo(ctr[2], vec_splats(PHILOX_M4x32_1));
#ifdef __LITTLE_ENDIAN__
__vector unsigned int lo0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergee(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergeo(lohi1a, lohi1b);
#else
__vector unsigned int lo0 = vec_mergeo(lohi0a, lohi0b);
__vector unsigned int lo1 = vec_mergeo(lohi1a, lohi1b);
__vector unsigned int hi0 = vec_mergee(lohi0a, lohi0b);
__vector unsigned int hi1 = vec_mergee(lohi1a, lohi1b);
#endif
#endif
ctr[0] = vec_xor(vec_xor(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = vec_xor(vec_xor(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__vector unsigned int* key)
{
key[0] = vec_add(key[0], vec_splats(PHILOX_W32_0));
key[1] = vec_add(key[1], vec_splats(PHILOX_W32_1));
}
#ifdef __VSX__
template<bool high>
QUALIFIERS __vector double _uniform_double_hq(__vector unsigned int x, __vector unsigned int y)
{
// convert 32 to 64 bit
#ifdef __LITTLE_ENDIAN__
if (high)
{
x = vec_mergel(x, vec_splats(0U));
y = vec_mergel(y, vec_splats(0U));
}
else
{
x = vec_mergeh(x, vec_splats(0U));
y = vec_mergeh(y, vec_splats(0U));
}
#else
if (high)
{
x = vec_mergel(vec_splats(0U), x);
y = vec_mergel(vec_splats(0U), y);
}
else
{
x = vec_mergeh(vec_splats(0U), x);
y = vec_mergeh(vec_splats(0U), y);
}
#endif
// calculate z = x ^ (y << (53 - 32))
#ifdef _ARCH_PWR8
__vector unsigned long long z = vec_sl((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#else
__vector unsigned long long z = vec_vsld((__vector unsigned long long) y, vec_splats(53ULL - 32ULL));
#endif
z = vec_xor((__vector unsigned long long) x, z);
// convert uint64 to double
#ifdef __xlC__
__vector double rs = vec_ctd(z, 0);
#else
__vector double rs = vec_ctf(z, 0);
#endif
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vec_madd(rs, vec_splats(TWOPOW53_INV_DOUBLE), vec_splats(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
#endif
QUALIFIERS void philox_float4(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vec_ctf(ctr[0], 0);
rnd2 = vec_ctf(ctr[1], 0);
rnd3 = vec_ctf(ctr[2], 0);
rnd4 = vec_ctf(ctr[3], 0);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vec_madd(rnd1, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = vec_madd(rnd2, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = vec_madd(rnd3, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = vec_madd(rnd4, vec_splats(TWOPOW32_INV_FLOAT), vec_splats(TWOPOW32_INV_FLOAT/2.0f));
}
#ifdef __VSX__
QUALIFIERS void philox_double2(__vector unsigned int ctr0, __vector unsigned int ctr1, __vector unsigned int ctr2, __vector unsigned int ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int key[2] = {vec_splats(key0), vec_splats(key1)};
__vector unsigned int ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
#endif
QUALIFIERS void philox_float4(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector float & rnd1, __vector float & rnd2, __vector float & rnd3, __vector float & rnd4)
{
philox_float4(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifdef __VSX__
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1lo, __vector double & rnd1hi, __vector double & rnd2lo, __vector double & rnd2hi)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector unsigned int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
__vector unsigned int ctr0v = vec_splats(ctr0);
__vector unsigned int ctr2v = vec_splats(ctr2);
__vector unsigned int ctr3v = vec_splats(ctr3);
__vector double ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
QUALIFIERS void philox_double2(uint32 ctr0, __vector int ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__vector double & rnd1, __vector double & rnd2)
{
philox_double2(ctr0, (__vector unsigned int) ctr1, ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_NEON)
QUALIFIERS void _philox4x32round(uint32x4_t* ctr, uint32x4_t* key)
{
uint32x4_t lohi0a = vreinterpretq_u32_u64(vmull_u32(vget_low_u32(ctr[0]), vdup_n_u32(PHILOX_M4x32_0)));
uint32x4_t lohi0b = vreinterpretq_u32_u64(vmull_high_u32(ctr[0], vdupq_n_u32(PHILOX_M4x32_0)));
uint32x4_t lohi1a = vreinterpretq_u32_u64(vmull_u32(vget_low_u32(ctr[2]), vdup_n_u32(PHILOX_M4x32_1)));
uint32x4_t lohi1b = vreinterpretq_u32_u64(vmull_high_u32(ctr[2], vdupq_n_u32(PHILOX_M4x32_1)));
uint32x4_t lo0 = vuzp1q_u32(lohi0a, lohi0b);
uint32x4_t lo1 = vuzp1q_u32(lohi1a, lohi1b);
uint32x4_t hi0 = vuzp2q_u32(lohi0a, lohi0b);
uint32x4_t hi1 = vuzp2q_u32(lohi1a, lohi1b);
ctr[0] = veorq_u32(veorq_u32(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = veorq_u32(veorq_u32(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(uint32x4_t* key)
{
key[0] = vaddq_u32(key[0], vdupq_n_u32(PHILOX_W32_0));
key[1] = vaddq_u32(key[1], vdupq_n_u32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS float64x2_t _uniform_double_hq(uint32x4_t x, uint32x4_t y)
{
// convert 32 to 64 bit
if (high)
{
x = vzip2q_u32(x, vdupq_n_u32(0));
y = vzip2q_u32(y, vdupq_n_u32(0));
}
else
{
x = vzip1q_u32(x, vdupq_n_u32(0));
y = vzip1q_u32(y, vdupq_n_u32(0));
}
// calculate z = x ^ (y << (53 - 32))
uint64x2_t z = vshlq_n_u64(vreinterpretq_u64_u32(y), 53 - 32);
z = veorq_u64(vreinterpretq_u64_u32(x), z);
// convert uint64 to double
float64x2_t rs = vcvtq_f64_u64(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vfmaq_f64(vdupq_n_f64(TWOPOW53_INV_DOUBLE/2.0), vdupq_n_f64(TWOPOW53_INV_DOUBLE), rs);
return rs;
}
QUALIFIERS void philox_float4(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = vcvtq_f32_u32(ctr[0]);
rnd2 = vcvtq_f32_u32(ctr[1]);
rnd3 = vcvtq_f32_u32(ctr[2]);
rnd4 = vcvtq_f32_u32(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd1);
rnd2 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd2);
rnd3 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd3);
rnd4 = vfmaq_f32(vdupq_n_f32(TWOPOW32_INV_FLOAT/2.0), vdupq_n_f32(TWOPOW32_INV_FLOAT), rnd4);
}
QUALIFIERS void philox_double2(uint32x4_t ctr0, uint32x4_t ctr1, uint32x4_t ctr2, uint32x4_t ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
{
uint32x4_t key[2] = {vdupq_n_u32(key0), vdupq_n_u32(key1)};
uint32x4_t ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#ifndef _MSC_VER
QUALIFIERS void philox_float4(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float32x4_t & rnd1, float32x4_t & rnd2, float32x4_t & rnd3, float32x4_t & rnd4)
{
philox_float4(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
#endif
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1lo, float64x2_t & rnd1hi, float64x2_t & rnd2lo, float64x2_t & rnd2hi)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, uint32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2)
{
uint32x4_t ctr0v = vdupq_n_u32(ctr0);
uint32x4_t ctr2v = vdupq_n_u32(ctr2);
uint32x4_t ctr3v = vdupq_n_u32(ctr3);
float64x2_t ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
#ifndef _MSC_VER
QUALIFIERS void philox_double2(uint32 ctr0, int32x4_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
float64x2_t & rnd1, float64x2_t & rnd2)
{
philox_double2(ctr0, vreinterpretq_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_SME)
SVE_QUALIFIERS void _philox4x32round(svuint32x4_t & ctr, svuint32x2_t & key)
{
svuint32_t lo0 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t hi0 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 0), svdup_u32(PHILOX_M4x32_0));
svuint32_t lo1 = svmul_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
svuint32_t hi1 = svmulh_u32_x(svptrue_b32(), svget4_u32(ctr, 2), svdup_u32(PHILOX_M4x32_1));
ctr = svset4_u32(ctr, 0, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi1, svget4_u32(ctr, 1)), svget2_u32(key, 0)));
ctr = svset4_u32(ctr, 1, lo1);
ctr = svset4_u32(ctr, 2, sveor_u32_x(svptrue_b32(), sveor_u32_x(svptrue_b32(), hi0, svget4_u32(ctr, 3)), svget2_u32(key, 1)));
ctr = svset4_u32(ctr, 3, lo0);
}
SVE_QUALIFIERS void _philox4x32bumpkey(svuint32x2_t & key)
{
key = svset2_u32(key, 0, svadd_u32_x(svptrue_b32(), svget2_u32(key, 0), svdup_u32(PHILOX_W32_0)));
key = svset2_u32(key, 1, svadd_u32_x(svptrue_b32(), svget2_u32(key, 1), svdup_u32(PHILOX_W32_1)));
}
template<bool high>
SVE_QUALIFIERS svfloat64_t _uniform_double_hq(svuint32_t x, svuint32_t y)
{
// convert 32 to 64 bit
if (high)
{
x = svzip2_u32(x, svdup_u32(0));
y = svzip2_u32(y, svdup_u32(0));
}
else
{
x = svzip1_u32(x, svdup_u32(0));
y = svzip1_u32(y, svdup_u32(0));
}
// calculate z = x ^ (y << (53 - 32))
svuint64_t z = svlsl_n_u64_x(svptrue_b64(), svreinterpret_u64_u32(y), 53 - 32);
z = sveor_u64_x(svptrue_b64(), svreinterpret_u64_u32(x), z);
// convert uint64 to double
svfloat64_t rs = svcvt_f64_u64_x(svptrue_b64(), z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = svmad_f64_x(svptrue_b64(), rs, svdup_f64(TWOPOW53_INV_DOUBLE), svdup_f64(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
SVE_QUALIFIERS void philox_float4(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 0));
rnd2 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 1));
rnd3 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 2));
rnd4 = svcvt_f32_u32_x(svptrue_b32(), svget4_u32(ctr, 3));
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = svmad_f32_x(svptrue_b32(), rnd1, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd2 = svmad_f32_x(svptrue_b32(), rnd2, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd3 = svmad_f32_x(svptrue_b32(), rnd3, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
rnd4 = svmad_f32_x(svptrue_b32(), rnd4, svdup_f32(TWOPOW32_INV_FLOAT), svdup_f32(TWOPOW32_INV_FLOAT/2.0));
}
SVE_QUALIFIERS void philox_double2(svuint32_t ctr0, svuint32_t ctr1, svuint32_t ctr2, svuint32_t ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{
svuint32x2_t key = svcreate2_u32(svdup_u32(key0), svdup_u32(key1));
svuint32x4_t ctr = svcreate4_u32(ctr0, ctr1, ctr2, ctr3);
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
rnd1hi = _uniform_double_hq<true>(svget4_u32(ctr, 0), svget4_u32(ctr, 1));
rnd2lo = _uniform_double_hq<false>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
rnd2hi = _uniform_double_hq<true>(svget4_u32(ctr, 2), svget4_u32(ctr, 3));
}
SVE_QUALIFIERS void philox_float4(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
SVE_QUALIFIERS void philox_float4(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat32_st & rnd1, svfloat32_st & rnd2, svfloat32_st & rnd3, svfloat32_st & rnd4)
{
philox_float4(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1lo, svfloat64_st & rnd1hi, svfloat64_st & rnd2lo, svfloat64_st & rnd2hi)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svuint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2)
{
svuint32_t ctr0v = svdup_u32(ctr0);
svuint32_t ctr2v = svdup_u32(ctr2);
svuint32_t ctr3v = svdup_u32(ctr3);
svfloat64_st ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
SVE_QUALIFIERS void philox_double2(uint32 ctr0, svint32_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
svfloat64_st & rnd1, svfloat64_st & rnd2)
{
philox_double2(ctr0, svreinterpret_u32_s32(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#if defined(__riscv_v)
QUALIFIERS void _philox4x32round(vuint32m1_t & ctr0, vuint32m1_t & ctr1, vuint32m1_t & ctr2, vuint32m1_t & ctr3,
vuint32m1_t key0, vuint32m1_t key1)
{
vuint32m1_t lo0 = vmul_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi0 = vmulhu_vv_u32m1(ctr0, vmv_v_x_u32m1(PHILOX_M4x32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t lo1 = vmul_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
vuint32m1_t hi1 = vmulhu_vv_u32m1(ctr2, vmv_v_x_u32m1(PHILOX_M4x32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
ctr0 = vxor_vv_u32m1(vxor_vv_u32m1(hi1, ctr1, vsetvlmax_e32m1()), key0, vsetvlmax_e32m1());
ctr1 = lo1;
ctr2 = vxor_vv_u32m1(vxor_vv_u32m1(hi0, ctr3, vsetvlmax_e32m1()), key1, vsetvlmax_e32m1());
ctr3 = lo0;
}
QUALIFIERS void _philox4x32bumpkey(vuint32m1_t & key0, vuint32m1_t & key1)
{
key0 = vadd_vv_u32m1(key0, vmv_v_x_u32m1(PHILOX_W32_0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
key1 = vadd_vv_u32m1(key1, vmv_v_x_u32m1(PHILOX_W32_1, vsetvlmax_e32m1()), vsetvlmax_e32m1());
}
template<bool high>
QUALIFIERS vfloat64m1_t _uniform_double_hq(vuint32m1_t x, vuint32m1_t y)
{
// convert 32 to 64 bit
if (high)
{
size_t s = vsetvlmax_e32m1();
x = vslidedown_vx_u32m1(vundefined_u32m1(), x, s/2, s);
y = vslidedown_vx_u32m1(vundefined_u32m1(), y, s/2, s);
}
vuint64m1_t x64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(x), vsetvlmax_e64m1());
vuint64m1_t y64 = vwcvtu_x_x_v_u64m1(vlmul_trunc_v_u32m1_u32mf2(y), vsetvlmax_e64m1());
// calculate z = x ^ (y << (53 - 32))
vuint64m1_t z = vsll_vx_u64m1(y64, 53 - 32, vsetvlmax_e64m1());
z = vxor_vv_u64m1(x64, z, vsetvlmax_e64m1());
// convert uint64 to double
vfloat64m1_t rs = vfcvt_f_xu_v_f64m1(z, vsetvlmax_e64m1());
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = vfmadd_vv_f64m1(rs, vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE, vsetvlmax_e64m1()), vfmv_v_f_f64m1(TWOPOW53_INV_DOUBLE/2.0, vsetvlmax_e64m1()), vsetvlmax_e64m1());
return rs;
}
QUALIFIERS void philox_float4(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ctr2, vuint32m1_t ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
_philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 1
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 2
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 3
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 4
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 5
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 6
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 7
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 8
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 9
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 10
// convert uint32 to float
rnd1 = vfcvt_f_xu_v_f32m1(ctr0, vsetvlmax_e32m1());
rnd2 = vfcvt_f_xu_v_f32m1(ctr1, vsetvlmax_e32m1());
rnd3 = vfcvt_f_xu_v_f32m1(ctr2, vsetvlmax_e32m1());
rnd4 = vfcvt_f_xu_v_f32m1(ctr3, vsetvlmax_e32m1());
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = vfmadd_vv_f32m1(rnd1, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd2 = vfmadd_vv_f32m1(rnd2, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd3 = vfmadd_vv_f32m1(rnd3, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
rnd4 = vfmadd_vv_f32m1(rnd4, vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT, vsetvlmax_e32m1()), vfmv_v_f_f32m1(TWOPOW32_INV_FLOAT/2.0, vsetvlmax_e32m1()), vsetvlmax_e32m1());
}
QUALIFIERS void philox_double2(vuint32m1_t ctr0, vuint32m1_t ctr1, vuint32m1_t ctr2, vuint32m1_t ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
{
vuint32m1_t key0v = vmv_v_x_u32m1(key0, vsetvlmax_e32m1());
vuint32m1_t key1v = vmv_v_x_u32m1(key1, vsetvlmax_e32m1());
_philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 1
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 2
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 3
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 4
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 5
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 6
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 7
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 8
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 9
_philox4x32bumpkey(key0v, key1v); _philox4x32round(ctr0, ctr1, ctr2, ctr3, key0v, key1v); // 10
rnd1lo = _uniform_double_hq<false>(ctr0, ctr1);
rnd1hi = _uniform_double_hq<true>(ctr0, ctr1);
rnd2lo = _uniform_double_hq<false>(ctr2, ctr3);
rnd2hi = _uniform_double_hq<true>(ctr2, ctr3);
}
QUALIFIERS void philox_float4(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_float4(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat32m1_t & rnd1, vfloat32m1_t & rnd2, vfloat32m1_t & rnd3, vfloat32m1_t & rnd4)
{
philox_float4(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1lo, vfloat64m1_t & rnd1hi, vfloat64m1_t & rnd2lo, vfloat64m1_t & rnd2hi)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, vuint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
{
vuint32m1_t ctr0v = vmv_v_x_u32m1(ctr0, vsetvlmax_e32m1());
vuint32m1_t ctr2v = vmv_v_x_u32m1(ctr2, vsetvlmax_e32m1());
vuint32m1_t ctr3v = vmv_v_x_u32m1(ctr3, vsetvlmax_e32m1());
vfloat64m1_t ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
}
QUALIFIERS void philox_double2(uint32 ctr0, vint32m1_t ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
vfloat64m1_t & rnd1, vfloat64m1_t & rnd2)
{
philox_double2(ctr0, vreinterpret_v_i32m1_u32m1(ctr1), ctr2, ctr3, key0, key1, rnd1, rnd2);
}
#endif
#ifdef __AVX2__
QUALIFIERS void _philox4x32round(__m256i* ctr, __m256i* key)
{
__m256i lohi0a = _mm256_mul_epu32(ctr[0], _mm256_set1_epi32(PHILOX_M4x32_0));
__m256i lohi0b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[0], 32), _mm256_set1_epi32(PHILOX_M4x32_0));
__m256i lohi1a = _mm256_mul_epu32(ctr[2], _mm256_set1_epi32(PHILOX_M4x32_1));
__m256i lohi1b = _mm256_mul_epu32(_mm256_srli_epi64(ctr[2], 32), _mm256_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm256_shuffle_epi32(lohi0a, 0xD8);
lohi0b = _mm256_shuffle_epi32(lohi0b, 0xD8);
lohi1a = _mm256_shuffle_epi32(lohi1a, 0xD8);
lohi1b = _mm256_shuffle_epi32(lohi1b, 0xD8);
__m256i lo0 = _mm256_unpacklo_epi32(lohi0a, lohi0b);
__m256i hi0 = _mm256_unpackhi_epi32(lohi0a, lohi0b);
__m256i lo1 = _mm256_unpacklo_epi32(lohi1a, lohi1b);
__m256i hi1 = _mm256_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm256_xor_si256(_mm256_xor_si256(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm256_xor_si256(_mm256_xor_si256(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m256i* key)
{
key[0] = _mm256_add_epi32(key[0], _mm256_set1_epi32(PHILOX_W32_0));
key[1] = _mm256_add_epi32(key[1], _mm256_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m256d _uniform_double_hq(__m256i x, __m256i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 1));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 1));
}
else
{
x = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(x, 0));
y = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m256i z = _mm256_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm256_xor_si256(x, z);
// convert uint64 to double
__m256d rs = _my256_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
#ifdef __FMA__
rs = _mm256_fmadd_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE), _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#else
rs = _mm256_mul_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE));
rs = _mm256_add_pd(rs, _mm256_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
#endif
return rs;
}
QUALIFIERS void philox_float4(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _my256_cvtepu32_ps(ctr[0]);
rnd2 = _my256_cvtepu32_ps(ctr[1]);
rnd3 = _my256_cvtepu32_ps(ctr[2]);
rnd4 = _my256_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
#ifdef __FMA__
rnd1 = _mm256_fmadd_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm256_fmadd_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm256_fmadd_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm256_fmadd_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT), _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0));
#else
rnd1 = _mm256_mul_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd1 = _mm256_add_ps(rnd1, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd2 = _mm256_mul_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd2 = _mm256_add_ps(rnd2, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd3 = _mm256_mul_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd3 = _mm256_add_ps(rnd3, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
rnd4 = _mm256_mul_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT));
rnd4 = _mm256_add_ps(rnd4, _mm256_set1_ps(TWOPOW32_INV_FLOAT/2.0f));
#endif
}
QUALIFIERS void philox_double2(__m256i ctr0, __m256i ctr1, __m256i ctr2, __m256i ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i key[2] = {_mm256_set1_epi32(key0), _mm256_set1_epi32(key1)};
__m256i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256 & rnd1, __m256 & rnd2, __m256 & rnd3, __m256 & rnd4)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1lo, __m256d & rnd1hi, __m256d & rnd2lo, __m256d & rnd2hi)
{
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m256i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m256d & rnd1, __m256d & rnd2)
{
#if 0
__m256i ctr0v = _mm256_set1_epi32(ctr0);
__m256i ctr2v = _mm256_set1_epi32(ctr2);
__m256i ctr3v = _mm256_set1_epi32(ctr3);
__m256d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
__m128d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
philox_double2(ctr0, _mm256_extractf128_si256(ctr1, 0), ctr2, ctr3, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my256_set_m128d(rnd1hi, rnd1lo);
rnd2 = _my256_set_m128d(rnd2hi, rnd2lo);
#endif
}
#endif
#if defined(__AVX512F__) || defined(__AVX10_512BIT__)
QUALIFIERS void _philox4x32round(__m512i* ctr, __m512i* key)
{
__m512i lohi0a = _mm512_mul_epu32(ctr[0], _mm512_set1_epi32(PHILOX_M4x32_0));
__m512i lohi0b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[0], 32), _mm512_set1_epi32(PHILOX_M4x32_0));
__m512i lohi1a = _mm512_mul_epu32(ctr[2], _mm512_set1_epi32(PHILOX_M4x32_1));
__m512i lohi1b = _mm512_mul_epu32(_mm512_srli_epi64(ctr[2], 32), _mm512_set1_epi32(PHILOX_M4x32_1));
lohi0a = _mm512_shuffle_epi32(lohi0a, _MM_PERM_DBCA);
lohi0b = _mm512_shuffle_epi32(lohi0b, _MM_PERM_DBCA);
lohi1a = _mm512_shuffle_epi32(lohi1a, _MM_PERM_DBCA);
lohi1b = _mm512_shuffle_epi32(lohi1b, _MM_PERM_DBCA);
__m512i lo0 = _mm512_unpacklo_epi32(lohi0a, lohi0b);
__m512i hi0 = _mm512_unpackhi_epi32(lohi0a, lohi0b);
__m512i lo1 = _mm512_unpacklo_epi32(lohi1a, lohi1b);
__m512i hi1 = _mm512_unpackhi_epi32(lohi1a, lohi1b);
ctr[0] = _mm512_xor_si512(_mm512_xor_si512(hi1, ctr[1]), key[0]);
ctr[1] = lo1;
ctr[2] = _mm512_xor_si512(_mm512_xor_si512(hi0, ctr[3]), key[1]);
ctr[3] = lo0;
}
QUALIFIERS void _philox4x32bumpkey(__m512i* key)
{
key[0] = _mm512_add_epi32(key[0], _mm512_set1_epi32(PHILOX_W32_0));
key[1] = _mm512_add_epi32(key[1], _mm512_set1_epi32(PHILOX_W32_1));
}
template<bool high>
QUALIFIERS __m512d _uniform_double_hq(__m512i x, __m512i y)
{
// convert 32 to 64 bit
if (high)
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 1));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 1));
}
else
{
x = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(x, 0));
y = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(y, 0));
}
// calculate z = x ^ (y << (53 - 32))
__m512i z = _mm512_sll_epi64(y, _mm_set1_epi64x(53 - 32));
z = _mm512_xor_si512(x, z);
// convert uint64 to double
__m512d rs = _mm512_cvtepu64_pd(z);
// calculate rs * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE/2.0)
rs = _mm512_fmadd_pd(rs, _mm512_set1_pd(TWOPOW53_INV_DOUBLE), _mm512_set1_pd(TWOPOW53_INV_DOUBLE/2.0));
return rs;
}
QUALIFIERS void philox_float4(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
// convert uint32 to float
rnd1 = _mm512_cvtepu32_ps(ctr[0]);
rnd2 = _mm512_cvtepu32_ps(ctr[1]);
rnd3 = _mm512_cvtepu32_ps(ctr[2]);
rnd4 = _mm512_cvtepu32_ps(ctr[3]);
// calculate rnd * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT/2.0f)
rnd1 = _mm512_fmadd_ps(rnd1, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd2 = _mm512_fmadd_ps(rnd2, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd3 = _mm512_fmadd_ps(rnd3, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
rnd4 = _mm512_fmadd_ps(rnd4, _mm512_set1_ps(TWOPOW32_INV_FLOAT), _mm512_set1_ps(TWOPOW32_INV_FLOAT/2.0));
}
QUALIFIERS void philox_double2(__m512i ctr0, __m512i ctr1, __m512i ctr2, __m512i ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i key[2] = {_mm512_set1_epi32(key0), _mm512_set1_epi32(key1)};
__m512i ctr[4] = {ctr0, ctr1, ctr2, ctr3};
_philox4x32round(ctr, key); // 1
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 2
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 3
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 4
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 5
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 6
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 7
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 8
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 9
_philox4x32bumpkey(key); _philox4x32round(ctr, key); // 10
rnd1lo = _uniform_double_hq<false>(ctr[0], ctr[1]);
rnd1hi = _uniform_double_hq<true>(ctr[0], ctr[1]);
rnd2lo = _uniform_double_hq<false>(ctr[2], ctr[3]);
rnd2hi = _uniform_double_hq<true>(ctr[2], ctr[3]);
}
QUALIFIERS void philox_float4(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512 & rnd1, __m512 & rnd2, __m512 & rnd3, __m512 & rnd4)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
philox_float4(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, rnd2, rnd3, rnd4);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1lo, __m512d & rnd1hi, __m512d & rnd2lo, __m512d & rnd2hi)
{
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
}
QUALIFIERS void philox_double2(uint32 ctr0, __m512i ctr1, uint32 ctr2, uint32 ctr3,
uint32 key0, uint32 key1,
__m512d & rnd1, __m512d & rnd2)
{
#if 0
__m512i ctr0v = _mm512_set1_epi32(ctr0);
__m512i ctr2v = _mm512_set1_epi32(ctr2);
__m512i ctr3v = _mm512_set1_epi32(ctr3);
__m512d ignore;
philox_double2(ctr0v, ctr1, ctr2v, ctr3v, key0, key1, rnd1, ignore, rnd2, ignore);
#else
__m256d rnd1lo, rnd1hi, rnd2lo, rnd2hi;
philox_double2(ctr0, _mm512_extracti64x4_epi64(ctr1, 0), ctr2, ctr3, key0, key1, rnd1lo, rnd1hi, rnd2lo, rnd2hi);
rnd1 = _my512_set_m256d(rnd1hi, rnd1lo);
rnd2 = _my512_set_m256d(rnd2hi, rnd2lo);
#endif
}
#endif
#endif
/*
Copyright 2021, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <altivec.h>
#undef vector
#undef bool
inline void cachelineZero(void * p) {
#ifdef __xlC__
__dcbz(p);
#else
__asm__ volatile("dcbz 0, %0"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
static size_t size = _cachelineSize();
return size;
}
/*
Copyright 2023, Michael Kuron.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
inline void cachelineZero(void * p) {
#ifdef __riscv_zicboz
__asm__ volatile("cbo.zero (%0)"::"r"(p):"memory");
#endif
}
inline size_t _cachelineSize() {
// allocate and fill with ones
const size_t max_size = 0x100000;
uint8_t data[2*max_size];
for (size_t i = 0; i < 2*max_size; ++i) {
data[i] = 0xff;
}
// find alignment offset
size_t offset = max_size - ((uintptr_t) data) % max_size;
// zero a cacheline
cachelineZero((void*) (data + offset));
// make sure that at least one byte was zeroed
if (data[offset] != 0) {
return SIZE_MAX;
}
// make sure that nothing was zeroed before the pointer
if (data[offset-1] == 0) {
return SIZE_MAX;
}
// find the last byte that was zeroed
for (size_t size = 1; size < max_size; ++size) {
if (data[offset + size] != 0) {
return size;
}
}
// too much was zeroed
return SIZE_MAX;
}
inline size_t cachelineSize() {
#ifdef __riscv_zicboz
static size_t size = _cachelineSize();
return size;
#else
return SIZE_MAX;
#endif
}
# TODO #47 move to a module functions
import numpy as np
import sympy as sp

from pystencils.typing import CastFunc, collate_types, create_type, get_type_of_expression
from pystencils.sympyextensions import is_integer_sequence


class IntegerFunctionTwoArgsMixIn(sp.Function):
    is_integer = True

    def __new__(cls, arg1, arg2):
args = []
for a in (arg1, arg2):
if isinstance(a, sp.Number) or isinstance(a, int):
args.append(CastFunc(a, create_type("int")))
elif isinstance(a, np.generic):
args.append(CastFunc(a, a.dtype))
else:
args.append(a)
for a in args:
try:
dtype = get_type_of_expression(a)
if not dtype.is_int():
raise ValueError("Argument to integer function is not an int but " + str(dtype))
except NotImplementedError:
raise ValueError("Integer functions can only be constructed with typed expressions")
return super().__new__(cls, *args)
def _eval_evalf(self, *pargs, **kwargs):
arg1 = self.args[0].evalf(*pargs, **kwargs) if hasattr(self.args[0], 'evalf') else self.args[0]
arg2 = self.args[1].evalf(*pargs, **kwargs) if hasattr(self.args[1], 'evalf') else self.args[1]
return self._eval_op(arg1, arg2)
def _eval_op(self, arg1, arg2):
return self
# noinspection PyPep8Naming
class bitwise_xor(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
class bit_shift_right(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
class bit_shift_left(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
class bitwise_and(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
class bitwise_or(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
class int_div(IntegerFunctionTwoArgsMixIn):
def _eval_op(self, arg1, arg2):
return int(arg1 // arg2)
# noinspection PyPep8Naming
class int_power_of_2(IntegerFunctionTwoArgsMixIn):
pass
# noinspection PyPep8Naming
...@@ -27,6 +91,7 @@ class modulo_floor(sp.Function):
    '(int64_t)((a) / (b)) * (b)'
    """
    nargs = 2
    is_integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
...@@ -58,6 +123,7 @@ class modulo_ceil(sp.Function):
    '((a) % (b) == 0 ? a : ((int64_t)((a) / (b))+1) * (b))'
    """
    nargs = 2
    is_integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
...@@ -87,6 +153,7 @@ class div_ceil(sp.Function):
    '( (a) % (b) == 0 ? (int64_t)(a) / (int64_t)(b) : ( (int64_t)(a) / (int64_t)(b) ) +1 )'
    """
    nargs = 2
    is_integer = True

    def __new__(cls, integer, divisor):
        if is_integer_sequence((integer, divisor)):
...@@ -99,3 +166,33 @@ class div_ceil(sp.Function):
        assert dtype.is_int()
        code = "( ({0}) % ({1}) == 0 ? ({dtype})({0}) / ({dtype})({1}) : ( ({dtype})({0}) / ({dtype})({1}) ) +1 )"
        return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
# noinspection PyPep8Naming
class div_floor(sp.Function):
"""Integer division
Examples:
>>> div_floor(9, 4)
2
>>> div_floor(8, 4)
2
>>> from pystencils import TypedSymbol
>>> a, b = TypedSymbol("a", "int64"), TypedSymbol("b", "int32")
>>> div_floor(a, b).to_c(str)
'((int64_t)(a) / (int64_t)(b))'
"""
nargs = 2
is_integer = True
def __new__(cls, integer, divisor):
if is_integer_sequence((integer, divisor)):
return integer // divisor
else:
return super().__new__(cls, integer, divisor)
def to_c(self, print_func):
dtype = collate_types((get_type_of_expression(self.args[0]), get_type_of_expression(self.args[1])))
assert dtype.is_int()
code = "(({dtype})({0}) / ({dtype})({1}))"
return code.format(print_func(self.args[0]), print_func(self.args[1]), dtype=dtype)
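A quick illustration of how these integer functions are meant to be used (a minimal sketch, assuming the package is importable as `pystencils`; the exact generated C code may differ between versions): with plain Python integers they fold to constants, while with typed symbols they stay symbolic.

# Sketch only: behaviour of the integer functions defined above.
import pystencils as ps
from pystencils.integer_functions import bit_shift_left, bitwise_and, div_ceil, div_floor

a, b = ps.TypedSymbol("a", "int64"), ps.TypedSymbol("b", "int64")

print(div_ceil(7, 2), div_floor(7, 2))       # plain ints -> folds to 4 and 3
expr = bitwise_and(bit_shift_left(a, b), a)  # typed symbols -> stays a symbolic integer function
print(expr)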
"""Transformations using integer sets based on ISL library""" """Transformations using integer sets based on ISL library"""
import sympy as sp
import islpy as isl import islpy as isl
import sympy as sp
import pystencils.astnodes as ast import pystencils.astnodes as ast
from pystencils.transformations import parents_of_type from pystencils.typing import parents_of_type
from pystencils.backends.cbackend import CustomSympyPrinter
def remove_brackets(s): def remove_brackets(s):
...@@ -36,13 +37,12 @@ def isl_iteration_set(node: ast.Node): ...@@ -36,13 +37,12 @@ def isl_iteration_set(node: ast.Node):
loop_start_str = remove_brackets(str(loop.start)) loop_start_str = remove_brackets(str(loop.start))
loop_stop_str = remove_brackets(str(loop.stop)) loop_stop_str = remove_brackets(str(loop.stop))
ctr_name = loop.loop_counter_name ctr_name = loop.loop_counter_name
set_string_description = "{} >= {} and {} < {}".format(ctr_name, loop_start_str, ctr_name, loop_stop_str) set_string_description = f"{ctr_name} >= {loop_start_str} and {ctr_name} < {loop_stop_str}"
conditions.append(remove_brackets(set_string_description)) conditions.append(remove_brackets(set_string_description))
symbol_names = ','.join(degrees_of_freedom) symbol_names = ','.join(degrees_of_freedom)
condition_str = ' and '.join(conditions) condition_str = ' and '.join(conditions)
set_description = "{{ [{symbol_names}] : {condition_str} }}".format(symbol_names=symbol_names, set_description = f"{{ [{symbol_names}] : {condition_str} }}"
condition_str=condition_str)
return degrees_of_freedom, isl.BasicSet(set_description) return degrees_of_freedom, isl.BasicSet(set_description)
...@@ -52,12 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional): ...@@ -52,12 +52,13 @@ def simplify_loop_counter_dependent_conditional(conditional):
dofs_in_loops, iteration_set = isl_iteration_set(conditional) dofs_in_loops, iteration_set = isl_iteration_set(conditional)
if dofs_in_condition.issubset(dofs_in_loops): if dofs_in_condition.issubset(dofs_in_loops):
symbol_names = ','.join(dofs_in_loops) symbol_names = ','.join(dofs_in_loops)
condition_str = remove_brackets(str(conditional.condition_expr)) condition_str = CustomSympyPrinter().doprint(conditional.condition_expr)
condition_set = isl.BasicSet("{{ [{symbol_names}] : {condition_str} }}".format(symbol_names=symbol_names, condition_str = remove_brackets(condition_str)
condition_str=condition_str)) condition_set = isl.BasicSet(f"{{ [{symbol_names}] : {condition_str} }}")
if condition_set.is_empty(): if condition_set.is_empty():
conditional.replace_by_false_block() conditional.replace_by_false_block()
return
intersection = iteration_set.intersect(condition_set) intersection = iteration_set.intersect(condition_set)
if intersection.is_empty(): if intersection.is_empty():
......
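For intuition, the emptiness test above can be reproduced with islpy directly (a sketch that is independent of the pystencils AST): if the set described by a conditional's condition has an empty intersection with the loops' iteration set, the conditional can never fire inside the kernel.

# Sketch: the same emptiness reasoning with plain islpy sets.
import islpy as isl

iteration_set = isl.BasicSet("{ [ctr_0] : ctr_0 >= 0 and ctr_0 < 5 }")   # loop: 0 <= ctr_0 < 5
condition_set = isl.BasicSet("{ [ctr_0] : ctr_0 > 10 }")                 # conditional: ctr_0 > 10

print(iteration_set.intersect(condition_set).is_empty())   # True -> the true-branch is dead code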
import base64
from tempfile import NamedTemporaryFile

import matplotlib.animation as animation
import sympy as sp
from IPython.display import HTML

import pystencils.plot as plt

__all__ = ['log_progress', 'make_imshow_animation', 'display_animation', 'set_display_mode']


def log_progress(sequence, every=None, size=None, name='Items'):
    """Copied from https://github.com/alexanderkuk/log-progress"""
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)  # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(name=name, index=index)
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(name=name, index=index, size=size)
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))


VIDEO_TAG = """<video controls width="80%">
    <source src="data:video/x-m4v;base64,{0}" type="video/mp4">
...@@ -125,13 +66,13 @@ def display_in_extra_window(*_, **__):
# ------- Version 3: Animation is shown in images that are updated directly in website --------------


def display_as_html_image(animation, show=True, *args, **kwargs):
    from IPython import display

    try:
        if show:
            animation._init_draw()
        for _ in animation.frame_seq:
            if show:
                fig = plt.gcf()
                display.display(fig)
...
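For context, a minimal notebook sketch of this "Version 3" display mode outside the helper (assuming a Jupyter environment; the helper above additionally honours the show flag and the animation's frame sequence):

# Sketch: redraw a figure in place by re-displaying it and clearing the previous output.
import numpy as np
import matplotlib.pyplot as plt
from IPython import display

fig, ax = plt.subplots()
img = ax.imshow(np.zeros((32, 32)))
for _ in range(10):
    img.set_data(np.random.rand(32, 32))  # stand-in for one animation frame
    display.display(fig)                  # show the current figure state
    display.clear_output(wait=True)       # replace it once the next frame is ready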
from collections import namedtuple, defaultdict
from typing import Union
import sympy as sp
from sympy.codegen import Assignment
from pystencils.simp import AssignmentCollection
from pystencils import astnodes as ast, TypedSymbol
from pystencils.field import Field
from pystencils.node_collection import NodeCollection
from pystencils.transformations import NestedScopes
# TODO use this in Constraint Checker
accepted_functions = [
sp.Pow,
sp.sqrt,
sp.log,
# TODO trigonometric functions (and whatever tests will fail)
]
class KernelConstraintsCheck:
# TODO: proper specification
# TODO: More checks :)
"""Checks if the input to create_kernel is valid.
Test the following conditions:
- SSA Form for pure symbols:
- Every pure symbol may occur only once as left-hand-side of an assignment
- Every pure symbol that is read may not be written to later
- Independence / Parallelization condition:
- a field that is written may only be read at exactly the same spatial position
(Pure symbols are symbols that are not Field.Accesses)
"""
FieldAndIndex = namedtuple('FieldAndIndex', ['field', 'index'])
def __init__(self, check_independence_condition=True, check_double_write_condition=True):
self.scopes = NestedScopes()
self.field_reads = defaultdict(set)
self.field_writes = defaultdict(set)
self.fields_read = set()
self.check_independence_condition = check_independence_condition
self.check_double_write_condition = check_double_write_condition
def visit(self, obj):
if isinstance(obj, (AssignmentCollection, NodeCollection)):
[self.visit(e) for e in obj.all_assignments]
elif isinstance(obj, list) or isinstance(obj, tuple):
[self.visit(e) for e in obj]
elif isinstance(obj, (sp.Eq, ast.SympyAssignment, Assignment)):
self.process_assignment(obj)
elif isinstance(obj, ast.Conditional):
self.scopes.push()
# Disable double write check inside conditionals
# would be triggered by e.g. in-kernel boundaries
old_double_write = self.check_double_write_condition
old_independence_condition = self.check_independence_condition
self.check_double_write_condition = False
self.check_independence_condition = False
if obj.false_block:
self.visit(obj.false_block)
self.process_expression(obj.condition_expr)
self.process_expression(obj.true_block)
self.check_double_write_condition = old_double_write
self.check_independence_condition = old_independence_condition
self.scopes.pop()
elif isinstance(obj, ast.Block):
self.scopes.push()
[self.visit(e) for e in obj.args]
self.scopes.pop()
elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
pass
else:
raise ValueError(f'Invalid object in kernel {type(obj)}')
def process_assignment(self, assignment: Union[sp.Eq, ast.SympyAssignment, Assignment]):
# for checks it is crucial to process rhs before lhs to catch e.g. a = a + 1
self.process_expression(assignment.rhs)
self.process_lhs(assignment.lhs)
def process_expression(self, rhs):
# TODO constraint for accepted functions, see TODO above
self.update_accesses_rhs(rhs)
if isinstance(rhs, Field.Access):
self.fields_read.add(rhs.field)
self.fields_read.update(rhs.indirect_addressing_fields)
else:
for arg in rhs.args:
self.process_expression(arg)
@property
def fields_written(self):
"""
Return all fields that are written to (i.e. the fields appearing on the lhs of assignments)
"""
return set(k.field for k, v in self.field_writes.items() if len(v))
def process_lhs(self, lhs: Union[Field.Access, TypedSymbol, sp.Symbol]):
assert isinstance(lhs, sp.Symbol)
self.update_accesses_lhs(lhs)
def update_accesses_lhs(self, lhs):
if isinstance(lhs, Field.Access):
fai = self.FieldAndIndex(lhs.field, lhs.index)
if self.check_double_write_condition and lhs.offsets in self.field_writes[fai]:
raise ValueError(f"Field {lhs.field.name} is written twice at the same location")
self.field_writes[fai].add(lhs.offsets)
if self.check_double_write_condition and len(self.field_writes[fai]) > 1:
raise ValueError(
f"Field {lhs.field.name} is written at two different locations")
if fai in self.field_reads:
reads = tuple(self.field_reads[fai])
if len(reads) > 1 or lhs.offsets != reads[0]:
if self.check_independence_condition:
raise ValueError(f"Field {lhs.field.name} is written at different location than it was read. "
f"This means the resulting kernel would not be thread safe")
elif isinstance(lhs, sp.Symbol):
if self.scopes.is_defined_locally(lhs):
raise ValueError(f"Assignments not in SSA form, multiple assignments to {lhs.name}")
if lhs in self.scopes.free_parameters:
raise ValueError(f"Symbol {lhs.name} is written, after it has been read")
self.scopes.define_symbol(lhs)
def update_accesses_rhs(self, rhs):
if isinstance(rhs, Field.Access) and self.check_independence_condition:
fai = self.FieldAndIndex(rhs.field, rhs.index)
writes = self.field_writes[fai]
self.field_reads[fai].add(rhs.offsets)
for write_offset in writes:
assert len(writes) == 1
if write_offset != rhs.offsets:
raise ValueError(f"Violation of loop independence condition. Field "
f"{rhs.field} is read at {rhs.offsets} and written at {write_offset}")
self.fields_read.add(rhs.field)
elif isinstance(rhs, sp.Symbol):
self.scopes.access_symbol(rhs)
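A hedged example of the independence condition described in the docstring above, triggered through the public create_kernel API (which runs this checker internally); the exact error message may differ between versions.

# Sketch: writing a field at the centre while reading it at a neighbour offset is rejected.
import pystencils as ps

f = ps.fields('f: double[2D]')
not_thread_safe = ps.Assignment(f[0, 0], f[1, 0] + 1)   # write at (0, 0), read at (1, 0)

try:
    ps.create_kernel(not_thread_safe)
except ValueError as err:
    print(err)   # complains that f is written at a different location than it was read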
import ast
import inspect
import textwrap
from typing import Callable, Union, List, Dict, Tuple

import sympy as sp

from pystencils.assignment import Assignment
from pystencils.sympyextensions import SymbolCreator
from pystencils.config import CreateKernelConfig

__all__ = ['kernel', 'kernel_config']


def _kernel(func: Callable[..., None], **kwargs) -> Tuple[List[Assignment], str]:
    """
    Convenience function for the kernel decorators to prevent code duplication.
    Args:
        func: decorated function
        **kwargs: kwargs for the function
    Returns:
        assignments, function_name
    """
    source = inspect.getsource(func)
    source = textwrap.dedent(source)
...@@ -49,9 +42,76 @@ def kernel(func, **kwargs):
    if 's' in args and 's' not in kwargs:
        kwargs['s'] = SymbolCreator()
    func(**kwargs)
    return assignments, func.__name__
def kernel(func: Callable[..., None], **kwargs) -> List[Assignment]:
"""Decorator to simplify generation of pystencils Assignments.
Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
in the result list. Furthermore, the ternary inline 'if-else' changes its meaning to denote a
sympy Piecewise.
The decorated function may not receive any arguments, with the exception of an argument called 's' that specifies
a SymbolCreator()
Args:
func: decorated function
**kwargs: kwargs for the function
Examples:
>>> import pystencils as ps
>>> @kernel
... def my_kernel(s):
... f, g = ps.fields('f, g: [2D]')
... s.neighbors @= f[0,1] + f[1,0]
... g[0,0] @= s.neighbors + f[0,0] if f[0,0] > 0 else 0
>>> f, g = ps.fields('f, g: [2D]')
>>> assert my_kernel[0].rhs == f[0,1] + f[1,0]
"""
assignments, _ = _kernel(func, **kwargs)
return assignments
def kernel_config(config: CreateKernelConfig, **kwargs) -> Callable[..., Dict]:
"""Decorator to simplify generation of pystencils Assignments, which takes a configuration
and updates the function name accordingly.
Changes the meaning of the '@=' operator. Each line containing this operator gives a symbolic assignment
in the result list. Furthermore, the ternary inline 'if-else' changes its meaning to denote a
sympy Piecewise.
The decorated function may not receive any arguments, with the exception of an argument called 's' that specifies
a SymbolCreator()
Args:
config: Specify whether to return the list with assignments, or a dictionary containing additional settings
like func_name
Returns:
decorator with config
Examples:
>>> import pystencils as ps
>>> kernel_configuration = ps.CreateKernelConfig()
>>> @kernel_config(kernel_configuration)
... def my_kernel(s):
... src, dst = ps.fields('src, dst: [2D]')
... s.neighbors @= src[0, 1] + src[1, 0]
... dst[0, 0] @= s.neighbors + src[0, 0] if src[0, 0] > 0 else 0
>>> f, g = ps.fields('src, dst: [2D]')
>>> assert my_kernel['assignments'][0].rhs == f[0, 1] + f[1, 0]
"""
def decorator(func: Callable[..., None]) -> Union[List[Assignment], Dict]:
"""
Args:
func: decorated function
Returns:
Dict for unpacking into create_kernel
"""
assignments, func_name = _kernel(func, **kwargs)
config.function_name = func_name
return {'assignments': assignments, 'config': config}
return decorator
# noinspection PyMethodMayBeStatic
class KernelFunctionRewrite(ast.NodeTransformer):
...
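The dictionary returned by the kernel_config decorator above is meant to be unpacked straight into create_kernel; a short usage sketch (assuming kernel_config and CreateKernelConfig are exported at the package level, as in the doctest):

# Sketch: unpack the decorator's result directly into create_kernel.
import pystencils as ps

@ps.kernel_config(ps.CreateKernelConfig(cpu_openmp=True))
def copy_kernel(s):
    src, dst = ps.fields('src, dst: [2D]')
    dst[0, 0] @= src[0, 0]

kernel_ast = ps.create_kernel(**copy_kernel)   # 'assignments' and 'config' come from the decorator
kernel = kernel_ast.compile()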
import pystencils
class KernelWrapper:
"""
Light-weight wrapper around a compiled kernel.
Can be called while still providing access to underlying AST.
"""
def __init__(self, kernel, parameters, ast_node: pystencils.astnodes.KernelFunction):
self.kernel = kernel
self.parameters = parameters
self.ast = ast_node
self.num_regs = None
def __call__(self, **kwargs):
return self.kernel(**kwargs)
@property
def code(self):
return pystencils.get_code_str(self.ast)
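A short sketch of how this wrapper is typically encountered (assuming a CPU target, where compile() returns such a wrapper around the shared-library function):

# Sketch: the compiled kernel can be called directly and still exposes its AST and code.
import numpy as np
import pystencils as ps

src, dst = ps.fields('src, dst: [2D]')
kernel = ps.create_kernel(ps.Assignment(dst[0, 0], src[0, 0])).compile()

kernel(src=np.ones((4, 4)), dst=np.zeros((4, 4)))   # __call__ forwards to the compiled function
print(kernel.ast.function_name)                     # the KernelFunction AST is still reachable
print(kernel.code[:80])                             # generated C code via pystencils.get_code_str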
import itertools
import warnings
from typing import Union, List
import sympy as sp
from pystencils.config import CreateKernelConfig
from pystencils.assignment import Assignment, AddAugmentedAssignment
from pystencils.astnodes import Node, Block, Conditional, LoopOverCoordinate, SympyAssignment
from pystencils.cpu.vectorization import vectorize
from pystencils.enums import Target, Backend
from pystencils.field import Field, FieldType
from pystencils.node_collection import NodeCollection
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.kernel_contrains_check import KernelConstraintsCheck
from pystencils.simplificationfactory import create_simplification_strategy
from pystencils.stencil import direction_string_to_offset, inverse_direction_string
from pystencils.transformations import (
loop_blocking, move_constants_before_loop, remove_conditionals_in_staggered_kernel)
def create_kernel(assignments: Union[Assignment, List[Assignment],
AddAugmentedAssignment, List[AddAugmentedAssignment],
AssignmentCollection, List[Node], NodeCollection],
*,
config: CreateKernelConfig = None, **kwargs):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
This function forms the general API and delegates the kernel creation to other functions depending on the CreateKernelConfig.
Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
config: CreateKernelConfig which includes the needed configuration
kwargs: Arguments for updating the config
Returns:
abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
can be compiled through its 'compile()' member
Example:
>>> import pystencils as ps
>>> import numpy as np
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> kernel_ast = ps.create_kernel(assignment, config=ps.CreateKernelConfig(cpu_openmp=True))
>>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5]))
>>> d_arr
array([[0., 0., 0., 0., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]])
"""
# ---- Updating configuration from kwargs
if not config:
config = CreateKernelConfig(**kwargs)
else:
for k, v in kwargs.items():
if not hasattr(config, k):
raise KeyError(f'{k} is not a valid kwarg. Please look in CreateKernelConfig for valid settings')
setattr(config, k, v)
# ---- Normalizing parameters
if isinstance(assignments, (Assignment, AddAugmentedAssignment)):
assignments = [assignments]
assert assignments, "Assignments must not be empty!"
if isinstance(assignments, list):
assignments = NodeCollection(assignments)
elif isinstance(assignments, AssignmentCollection):
# TODO Markus check and document
# --- applying first default simplifications
try:
if config.default_assignment_simplifications:
simplification = create_simplification_strategy()
assignments = simplification(assignments)
except Exception as e:
warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
f"AssignmentCollection due to the following problem :{e}")
simplification_hints = assignments.simplification_hints
assignments = NodeCollection.from_assignment_collection(assignments)
assignments.simplification_hints = simplification_hints
if config.index_fields:
return create_indexed_kernel(assignments, config=config)
else:
return create_domain_kernel(assignments, config=config)
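Because kwargs are merged into the config as shown above, the following two calls are equivalent (a sketch; cpu_openmp is one of the CreateKernelConfig fields used further down in this file):

# Sketch: kwargs are just a shorthand for building or updating a CreateKernelConfig.
import pystencils as ps

s, d = ps.fields('s, d: [2D]')
update = ps.Assignment(d[0, 0], 2 * s[0, 0])

ast_a = ps.create_kernel(update, cpu_openmp=True)                                # kwargs path
ast_b = ps.create_kernel(update, config=ps.CreateKernelConfig(cpu_openmp=True))  # explicit config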
def create_domain_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
"""
Creates abstract syntax tree (AST) of kernel, using a NodeCollection.
Note that `create_domain_kernel` is a lower-level function which should be accessed by not providing `index_fields`
to create_kernel
Args:
assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration
Returns:
abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
can be compiled through its 'compile()' member
Example:
>>> import pystencils as ps
>>> import numpy as np
>>> from pystencils.kernelcreation import create_domain_kernel
>>> from pystencils.node_collection import NodeCollection
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
>>> kernel_ast = create_domain_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5]))
>>> d_arr
array([[0., 0., 0., 0., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]])
"""
# --- eval
assignments.evaluate_terms()
# FUTURE WORK from here we shouldn't NEED sympy
# --- check constraints
check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
# ---- Creating ast
ast = None
if config.target == Target.CPU:
if config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_kernel
ast = create_kernel(assignments, config=config)
for optimization in config.cpu_prepend_optimizations:
optimization(ast)
omp_collapse = None
if config.cpu_blocking:
omp_collapse = loop_blocking(ast, config.cpu_blocking)
if config.cpu_openmp:
add_openmp(ast, num_threads=config.cpu_openmp, collapse=omp_collapse,
assume_single_outer_loop=config.omp_single_loop)
if config.cpu_vectorize_info:
if config.cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(config.cpu_vectorize_info, dict):
vectorize(ast, **config.cpu_vectorize_info)
if config.cpu_openmp and config.cpu_blocking and 'nontemporal' in config.cpu_vectorize_info and \
config.cpu_vectorize_info['nontemporal'] and 'cachelineZero' in ast.instruction_set:
# This condition is stricter than it needs to be: if blocks along the fastest axis start on a
# cache line boundary, it's okay. But we cannot determine that here.
# We don't need to disallow OpenMP collapsing because it is never applied to the inner loop.
raise ValueError("Blocking cannot be combined with cacheline-zeroing")
else:
raise ValueError("Invalid value for cpu_vectorize_info")
elif config.target == Target.GPU:
if config.backend == Backend.CUDA:
from pystencils.gpu import create_cuda_kernel
ast = create_cuda_kernel(assignments, config=config)
if not ast:
raise NotImplementedError(
f'{config.target} together with {config.backend} is not supported by `create_domain_kernel`')
if config.use_auto_for_assignments:
for a in ast.atoms(SympyAssignment):
a.use_auto = True
return ast
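A sketch of driving the CPU branch above purely through the config (assuming an SIMD-capable build environment; the cpu_vectorize_info keys follow the keyword arguments of vectorize and are assumptions here, and note the rejected combination above: blocking together with non-temporal stores on architectures with cacheline zeroing):

# Sketch: blocking, OpenMP and vectorization are all selected via CreateKernelConfig.
import pystencils as ps

s, d = ps.fields('s, d: [2D]')
update = ps.Assignment(d[0, 0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])

config = ps.CreateKernelConfig(
    cpu_openmp=4,                      # add_openmp(..., num_threads=4), collapse comes from blocking
    cpu_blocking=(32, 32),             # loop_blocking with 32x32 blocks
    cpu_vectorize_info={'assume_inner_stride_one': True, 'nontemporal': False},  # forwarded to vectorize(ast, **...)
)
kernel = ps.create_kernel(update, config=config).compile()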
def create_indexed_kernel(assignments: NodeCollection, *, config: CreateKernelConfig):
"""
Similar to :func:`create_kernel`, but here not all cells of a field are updated; only the cells whose
coordinates are stored in an index field are visited. This traversal method can e.g. be used for boundary handling.
The coordinates are stored in a separated index_field, which is a one dimensional array with struct data type.
This struct has to contain fields named 'x', 'y' and, for 3D fields, 'z'. These names are configurable with the
'coordinate_names' parameter. The struct can also have other fields that can be read and written in the kernel, for
example boundary parameters.
Note that `create_indexed_kernel` is a lower-level function which should be accessed by providing `index_fields`
to create_kernel
Args:
assignments: `pystencils.node_collection.NodeCollection` containing all assignments and nodes to be processed
config: CreateKernelConfig which includes the needed configuration
Returns:
abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
can be compiled through its 'compile()' member
Example:
>>> import pystencils as ps
>>> from pystencils.node_collection import NodeCollection
>>> import numpy as np
>>> from pystencils.kernelcreation import create_indexed_kernel
>>>
>>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
>>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
>>> idx_field = ps.fields(idx=index_arr)
>>>
>>> # Additional values stored in index field can be accessed in the kernel as well
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0, 0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> kernel_config = ps.CreateKernelConfig(index_fields=[idx_field], coordinate_names=('x', 'y'))
>>> kernel_ast = create_indexed_kernel(NodeCollection([assignment]), config=kernel_config)
>>> kernel = kernel_ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
>>> d_arr
array([[0. , 0. , 0. , 0. , 0. ],
[0. , 4.1, 0. , 0. , 0. ],
[0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]])
"""
# --- eval
assignments.evaluate_terms()
# FUTURE WORK from here we shouldn't NEED sympy
# --- check constraints
check = KernelConstraintsCheck(check_independence_condition=not config.skip_independence_check,
check_double_write_condition=not config.allow_double_writes)
check.visit(assignments)
assignments.bound_fields = check.fields_written
assignments.rhs_fields = check.fields_read
ast = None
if config.target == Target.CPU and config.backend == Backend.C:
from pystencils.cpu import add_openmp, create_indexed_kernel
ast = create_indexed_kernel(assignments, config=config)
if config.cpu_openmp:
add_openmp(ast, num_threads=config.cpu_openmp)
elif config.target == Target.GPU:
if config.backend == Backend.CUDA:
from pystencils.gpu import created_indexed_cuda_kernel
ast = created_indexed_cuda_kernel(assignments, config=config)
if not ast:
raise NotImplementedError(f'Indexed kernels are not yet supported for {config.target} with {config.backend}')
return ast
def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs):
"""Kernel that updates a staggered field.
.. image:: /img/staggered_grid.svg
For a staggered field, the first index coordinate defines the location of the staggered value.
Further index coordinates can be used to store vectors/tensors at each point.
Args:
assignments: a sequence of assignments or an AssignmentCollection.
Assignments to staggered fields are processed specially, while subexpressions and assignments to
regular fields are passed through to `create_kernel`. Multiple different staggered fields can be
used, but they all need to use the same stencil (i.e. the same number of staggered points) and
shape.
target: 'CPU' or 'GPU'
gpu_exclusive_conditions: disable the use of multiple conditionals inside the loop. The outer layers are then
handled in an else branch.
kwargs: passed directly to create_kernel, iteration_slice and ghost_layers parameters are not allowed
Returns:
AST, see `create_kernel`
"""
# TODO: Add documentation like in the other kernels
if 'ghost_layers' in kwargs:
assert kwargs['ghost_layers'] is None
del kwargs['ghost_layers']
if 'iteration_slice' in kwargs:
assert kwargs['iteration_slice'] is None
del kwargs['iteration_slice']
if 'omp_single_loop' in kwargs:
assert kwargs['omp_single_loop'] is False
del kwargs['omp_single_loop']
if isinstance(assignments, AssignmentCollection):
subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments
if not hasattr(a, 'lhs')
or type(a.lhs) is not Field.Access
or not FieldType.is_staggered(a.lhs.field)]
assignments = [a for a in assignments.main_assignments if hasattr(a, 'lhs')
and type(a.lhs) is Field.Access
and FieldType.is_staggered(a.lhs.field)]
else:
subexpressions = [a for a in assignments if not hasattr(a, 'lhs')
or type(a.lhs) is not Field.Access
or not FieldType.is_staggered(a.lhs.field)]
assignments = [a for a in assignments if hasattr(a, 'lhs')
and type(a.lhs) is Field.Access
and FieldType.is_staggered(a.lhs.field)]
if len(set([tuple(a.lhs.field.staggered_stencil) for a in assignments])) != 1:
raise ValueError("All assignments need to be made to staggered fields with the same stencil")
if len(set([a.lhs.field.shape for a in assignments])) != 1:
raise ValueError("All assignments need to be made to staggered fields with the same shape")
staggered_field = assignments[0].lhs.field
stencil = staggered_field.staggered_stencil
dim = staggered_field.spatial_dimensions
shape = staggered_field.shape
counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
final_assignments = []
# find out whether any of the ghost layers is not needed
common_exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
for direction in stencil:
exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
for elementary_direction in direction:
exclusions.remove(inverse_direction_string(elementary_direction))
common_exclusions.intersection_update(exclusions)
ghost_layers = [[0, 0] for d in range(dim)]
for direction in common_exclusions:
direction = direction_string_to_offset(direction)
for d, s in enumerate(direction):
if s == 1:
ghost_layers[d][1] = 1
elif s == -1:
ghost_layers[d][0] = 1
def condition(direction):
"""exclude those staggered points that correspond to fluxes between ghost cells"""
exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
for elementary_direction in direction:
exclusions.remove(inverse_direction_string(elementary_direction))
conditions = []
for e in exclusions:
if e in common_exclusions:
continue
offset = direction_string_to_offset(e)
for i, o in enumerate(offset):
if o == 1:
conditions.append(counters[i] < shape[i] - 1)
elif o == -1:
conditions.append(counters[i] > 0)
return sp.And(*conditions)
if gpu_exclusive_conditions:
outer_assignment = None
conditions = {direction: condition(direction) for direction in stencil}
for num_conditions in range(len(stencil)):
for combination in itertools.combinations(conditions.values(), num_conditions):
for assignment in assignments:
direction = stencil[assignment.lhs.index[0]]
if conditions[direction] in combination:
assignment = SympyAssignment(assignment.lhs, assignment.rhs)
outer_assignment = Conditional(sp.And(*combination), Block([assignment]), outer_assignment)
inner_assignment = []
for assignment in assignments:
inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
Block(inner_assignment), outer_assignment)
final_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
[SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
[last_conditional]
config = CreateKernelConfig(target=target, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
ast = create_kernel(final_assignments, config=config)
return ast
for assignment in assignments:
direction = stencil[assignment.lhs.index[0]]
sp_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
[SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
[SympyAssignment(assignment.lhs, assignment.rhs)]
last_conditional = Conditional(condition(direction), Block(sp_assignments))
final_assignments.append(last_conditional)
remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers])
prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional),
move_constants_before_loop]
if 'cpu_prepend_optimizations' in kwargs:
prepend_optimizations += kwargs['cpu_prepend_optimizations']
del kwargs['cpu_prepend_optimizations']
config = CreateKernelConfig(ghost_layers=ghost_layers, target=target, omp_single_loop=False,
cpu_prepend_optimizations=prepend_optimizations, **kwargs)
ast = create_kernel(final_assignments, config=config)
return ast
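A rough usage sketch for create_staggered_kernel (the staggered field declaration and the staggered_access call are written from memory and may need adjusting to the current Field API):

# Sketch: one flux value per lower cell face of a 2D staggered field (directions W and S).
import pystencils as ps

c = ps.fields('c: double[2D]')
j = ps.fields('j(2): double[2D]', field_type=ps.FieldType.STAGGERED)

assignments = [ps.Assignment(j.staggered_access('W'), c[0, 0] - c[-1, 0]),
               ps.Assignment(j.staggered_access('S'), c[0, 0] - c[0, -1])]
kernel = ps.create_staggered_kernel(assignments, target=ps.Target.CPU).compile()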
from typing import Any, Dict, List, Union, Optional, Set
import sympy
import sympy as sp
from sympy.codegen.rewriting import ReplaceOptim, optimize
from pystencils.assignment import Assignment, AddAugmentedAssignment
import pystencils.astnodes as ast
from pystencils.backends.cbackend import CustomCodeNode
from pystencils.functions import DivFunc
from pystencils.simp import AssignmentCollection
from pystencils.typing import FieldPointerSymbol
class NodeCollection:
def __init__(self, assignments: List[Union[ast.Node, Assignment]],
simplification_hints: Optional[Dict[str, Any]] = None,
bound_fields: Set[sp.Symbol] = None, rhs_fields: Set[sp.Symbol] = None):
def visit(obj):
if isinstance(obj, (list, tuple)):
return [visit(e) for e in obj]
if isinstance(obj, Assignment):
if isinstance(obj.lhs, FieldPointerSymbol):
return ast.SympyAssignment(obj.lhs, obj.rhs, is_const=obj.lhs.dtype.const)
return ast.SympyAssignment(obj.lhs, obj.rhs)
elif isinstance(obj, AddAugmentedAssignment):
return ast.SympyAssignment(obj.lhs, obj.lhs + obj.rhs)
elif isinstance(obj, ast.SympyAssignment):
return obj
elif isinstance(obj, ast.Conditional):
true_block = visit(obj.true_block)
false_block = None if obj.false_block is None else visit(obj.false_block)
return ast.Conditional(obj.condition_expr, true_block=true_block, false_block=false_block)
elif isinstance(obj, ast.Block):
return ast.Block([visit(e) for e in obj.args])
elif isinstance(obj, ast.Node) and not isinstance(obj, ast.LoopOverCoordinate):
return obj
else:
raise ValueError("Invalid object in the List of Assignments " + str(type(obj)))
self.all_assignments = visit(assignments)
self.simplification_hints = simplification_hints if simplification_hints else {}
self.bound_fields = bound_fields if bound_fields else {}
self.rhs_fields = rhs_fields if rhs_fields else {}
@staticmethod
def from_assignment_collection(assignment_collection: AssignmentCollection):
return NodeCollection(assignments=assignment_collection.all_assignments,
simplification_hints=assignment_collection.simplification_hints,
bound_fields=assignment_collection.bound_fields,
rhs_fields=assignment_collection.rhs_fields)
def evaluate_terms(self):
evaluate_constant_terms = ReplaceOptim(
lambda e: hasattr(e, 'is_constant') and e.is_constant and not e.is_integer,
lambda p: p.evalf()
)
evaluate_pow = ReplaceOptim(
lambda e: e.is_Pow and e.exp.is_Integer and abs(e.exp) <= 8,
lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * +p.exp), evaluate=False)) if p.exp > 0 else
(DivFunc(sp.Integer(1), p.base) if p.exp == -1 else
DivFunc(sp.Integer(1), sp.UnevaluatedExpr(sp.Mul(*([p.base] * -p.exp), evaluate=False))))
)
sympy_optimisations = [evaluate_constant_terms, evaluate_pow]
def visitor(node):
if isinstance(node, CustomCodeNode):
return node
elif isinstance(node, ast.Block):
return node.func([visitor(child) for child in node.args])
elif isinstance(node, ast.SympyAssignment):
new_lhs = visitor(node.lhs)
new_rhs = visitor(node.rhs)
return node.func(new_lhs, new_rhs, node.is_const, node.use_auto)
elif isinstance(node, ast.Node):
return node.func(*[visitor(child) for child in node.args])
elif isinstance(node, sympy.Basic):
return optimize(node, sympy_optimisations)
else:
raise NotImplementedError(f'{node} {type(node)} has no valid visitor')
self.all_assignments = [visitor(assignment) for assignment in self.all_assignments]
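For illustration, the power-unrolling part of evaluate_terms can be reproduced with plain sympy (a sketch restricted to positive exponents, leaving out the DivFunc handling for negative ones):

# Sketch: rewrite small integer powers as explicit products, like evaluate_pow above.
import sympy as sp
from sympy.codegen.rewriting import ReplaceOptim, optimize

unroll_pow = ReplaceOptim(
    lambda e: e.is_Pow and e.exp.is_Integer and 0 < e.exp <= 8,
    lambda p: sp.UnevaluatedExpr(sp.Mul(*([p.base] * int(p.exp)), evaluate=False))
)

x = sp.Symbol('x')
print(optimize(x**4 + 2 * x**2, [unroll_pow]))   # prints the powers as explicit, unevaluated products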