From c8dbf8cb8e2ba440dfb64e7a27ba53a27ce6ee08 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 23 Jul 2024 12:35:08 +0200
Subject: [PATCH] Various GPU-related and some general fixes.

---
 conftest.py                                   |   2 +-
 src/pystencils/backend/jit/__init__.py        |   4 +-
 src/pystencils/backend/jit/gpu_cupy.py        |  37 +++--
 src/pystencils/backend/jit/jit.py             |  61 +++++----
 src/pystencils/backend/jit/legacy_cpu.py      |  26 +++-
 .../backend/kernelcreation/analysis.py        |  21 +--
 .../backend/kernelcreation/ast_factory.py     |  73 +++++++---
 .../backend/kernelcreation/context.py         |   2 +-
 .../backend/kernelcreation/freeze.py          |   4 +-
 .../backend/kernelcreation/iteration_space.py |  14 +-
 src/pystencils/backend/platforms/cuda.py      |  41 +++++-
 src/pystencils/backend/platforms/sycl.py      |  50 +++++--
 src/pystencils/config.py                      | 128 +++++++++++++-----
 src/pystencils/gpu/periodicity.py             |  75 ++++++++++
 src/pystencils/kernel_wrapper.py              |  27 +---
 src/pystencils/kernelcreation.py              |  61 +++++++--
 src/pystencils/old/gpu/periodicity.py         |  49 -------
 src/pystencils/types/parsing.py               |   3 +
 .../{ => nbackend/kernelcreation}/test_gpu.py |  12 +-
 .../kernelcreation/test_index_kernels.py      |  41 ++++--
 .../kernelcreation/test_iteration_space.py    | 127 +++++++++++++++--
 .../kernelcreation/test_typification.py       |   4 +-
 22 files changed, 615 insertions(+), 247 deletions(-)
 create mode 100644 src/pystencils/gpu/periodicity.py
 delete mode 100644 src/pystencils/old/gpu/periodicity.py
 rename tests/{ => nbackend/kernelcreation}/test_gpu.py (94%)

diff --git a/conftest.py b/conftest.py
index 742ff7caa..d4b323f73 100644
--- a/conftest.py
+++ b/conftest.py
@@ -48,7 +48,7 @@ add_path_to_ignore('_local_tmp')
 try:
     import cupy
 except ImportError:
-    collect_ignore += [os.path.join(SCRIPT_FOLDER, "tests/test_gpu.py")]
+    collect_ignore += [os.path.join(SCRIPT_FOLDER, "tests/nbackend/kernelcreation/test_gpu.py")]
     add_path_to_ignore('src/pystencils/gpu')
 
 try:
diff --git a/src/pystencils/backend/jit/__init__.py b/src/pystencils/backend/jit/__init__.py
index edc755dfd..f382f0749 100644
--- a/src/pystencils/backend/jit/__init__.py
+++ b/src/pystencils/backend/jit/__init__.py
@@ -26,7 +26,8 @@ Both are available here through `LegacyCpuJit` and `LegacyGpuJit`.
 
 """
 
-from .jit import JitBase, NoJit, KernelWrapper, LegacyCpuJit, LegacyGpuJit
+from .jit import JitBase, NoJit, KernelWrapper
+from .legacy_cpu import LegacyCpuJit
 from .gpu_cupy import CupyJit
 
 no_jit = NoJit()
@@ -38,6 +39,5 @@ __all__ = [
     "LegacyCpuJit",
     "NoJit",
     "no_jit",
-    "LegacyGpuJit",
     "CupyJit",
 ]
diff --git a/src/pystencils/backend/jit/gpu_cupy.py b/src/pystencils/backend/jit/gpu_cupy.py
index 0c9b7b8a9..d6aaac2d2 100644
--- a/src/pystencils/backend/jit/gpu_cupy.py
+++ b/src/pystencils/backend/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Callable, Any
+from typing import Any, Sequence, cast
 from dataclasses import dataclass
 
 try:
@@ -22,6 +22,7 @@ from ..kernelfunction import (
     KernelParameter,
 )
 from ..emission import emit_code
+from ...types import PsStructType
 
 from ...include import get_pystencils_include_path
 
@@ -73,14 +74,7 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = tuple(
-            (
-                (k, v.dtype, v.strides, v.shape)
-                if isinstance(v, cp.ndarray)
-                else (k, id(v))
-            )
-            for k, v in kwargs.items()
-        )
+        key = (self._block_size,) + tuple((k, id(v)) for k, v in kwargs.items())
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -109,6 +103,10 @@ class CupyKernelWrapper(KernelWrapper):
 
             if field.has_fixed_shape:
                 expected_shape = tuple(int(s) for s in field.shape)
+                if isinstance(field.dtype, PsStructType):
+                    assert expected_shape[-1] == 1
+                    expected_shape = expected_shape[:-1]
+
                 actual_shape = arr.shape
                 if expected_shape != actual_shape:
                     raise ValueError(
@@ -117,6 +115,10 @@ class CupyKernelWrapper(KernelWrapper):
                     )
 
                 expected_strides = tuple(int(s) for s in field.strides)
+                if isinstance(field.dtype, PsStructType):
+                    assert expected_strides[-1] == 1
+                    expected_strides = expected_strides[:-1]
+                
                 actual_strides = tuple(s // arr.dtype.itemsize for s in arr.strides)
                 if expected_strides != actual_strides:
                     raise ValueError(
@@ -126,7 +128,7 @@ class CupyKernelWrapper(KernelWrapper):
 
             match field.field_type:
                 case FieldType.GENERIC:
-                    field_shapes.add(arr.shape)
+                    field_shapes.add(arr.shape[: field.spatial_dimensions])
 
                     if len(field_shapes) > 1:
                         raise ValueError(
@@ -200,11 +202,20 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: tuple[int, int, int] = (128, 2, 1)):
+    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
         self._runtime_headers = {"<cstdint>"}
-        self._default_block_size = default_block_size
 
-    def compile(self, kfunc: KernelFunction) -> Callable[..., None]:
+        if len(default_block_size) > 3:
+            raise ValueError(
+                f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
+            )
+
+        self._default_block_size: tuple[int, int, int] = cast(
+            tuple[int, int, int],
+            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
+        )
+
+    def compile(self, kfunc: KernelFunction) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
diff --git a/src/pystencils/backend/jit/jit.py b/src/pystencils/backend/jit/jit.py
index b455c3680..218424570 100644
--- a/src/pystencils/backend/jit/jit.py
+++ b/src/pystencils/backend/jit/jit.py
@@ -1,60 +1,61 @@
 from __future__ import annotations
-from typing import Callable, TYPE_CHECKING
+from typing import Sequence, TYPE_CHECKING
 from abc import ABC, abstractmethod
 
 if TYPE_CHECKING:
-    from ..kernelfunction import KernelFunction
+    from ..kernelfunction import KernelFunction, KernelParameter
+    from ...enums import Target
 
 
 class JitError(Exception):
     """Indicates an error during just-in-time compilation"""
 
 
-class JitBase(ABC):
-    """Base class for just-in-time compilation interfaces implemented in pystencils."""
+class KernelWrapper(ABC):
+    """Wrapper around a compiled and executable pystencils kernel."""
 
-    @abstractmethod
-    def compile(self, kernel: KernelFunction) -> Callable[..., None]:
-        """Compile a kernel function and return a callable object which invokes the kernel."""
-
-
-class KernelWrapper:
     def __init__(self, kfunc: KernelFunction) -> None:
         self._kfunc = kfunc
 
+    @abstractmethod
+    def __call__(self, **kwargs) -> None:
+        pass
+
     @property
     def kernel_function(self) -> KernelFunction:
         return self._kfunc
+    
+    @property
+    def ast(self) -> KernelFunction:
+        return self._kfunc
+    
+    @property
+    def target(self) -> Target:
+        return self._kfunc.target
+    
+    @property
+    def parameters(self) -> Sequence[KernelParameter]:
+        return self._kfunc.parameters
 
     @property
     def code(self) -> str:
         from pystencils.display_utils import get_code_str
 
-        return get_code_str(str)
+        return get_code_str(self._kfunc)
+
+
+class JitBase(ABC):
+    """Base class for just-in-time compilation interfaces implemented in pystencils."""
+
+    @abstractmethod
+    def compile(self, kernel: KernelFunction) -> KernelWrapper:
+        """Compile a kernel function and return a callable object which invokes the kernel."""
 
 
 class NoJit(JitBase):
     """Not a JIT compiler: Used to explicitly disable JIT compilation on an AST."""
 
-    def compile(self, kernel: KernelFunction) -> Callable[..., None]:
+    def compile(self, kernel: KernelFunction) -> KernelWrapper:
         raise JitError(
             "Just-in-time compilation of this kernel was explicitly disabled."
         )
-
-
-class LegacyCpuJit(JitBase):
-    """Wrapper around ``pystencils.cpu.cpujit``"""
-
-    def compile(self, kernel: KernelFunction) -> Callable[..., None]:
-        from .legacy_cpu import compile_and_load
-
-        return compile_and_load(kernel)
-
-
-class LegacyGpuJit(JitBase):
-    """Wrapper around ``pystencils.gpu.gpujit``"""
-
-    def compile(self, kernel: KernelFunction) -> Callable[..., None]:
-        from ...old.gpu.gpujit import make_python_function
-
-        return make_python_function(kernel)
diff --git a/src/pystencils/backend/jit/legacy_cpu.py b/src/pystencils/backend/jit/legacy_cpu.py
index ca5701160..1acd1b22a 100644
--- a/src/pystencils/backend/jit/legacy_cpu.py
+++ b/src/pystencils/backend/jit/legacy_cpu.py
@@ -47,6 +47,8 @@ Then 'cl.exe' is used to compile.
 
 from appdirs import user_cache_dir, user_config_dir
 from collections import OrderedDict
+from typing import Callable
+
 import importlib.util
 import json
 import os
@@ -60,14 +62,34 @@ import warnings
 
 
 from ..kernelfunction import KernelFunction
+from .jit import JitBase, KernelWrapper
 from .cpu_extension_module import PsKernelExtensioNModule
 
 from .msvc_detection import get_environment
 from pystencils.include import get_pystencils_include_path
-from pystencils.kernel_wrapper import KernelWrapper
 from pystencils.utils import atomic_file_write, recursive_dict_update
 
 
+class CpuKernelWrapper(KernelWrapper):
+    def __init__(self, kfunc: KernelFunction, compiled_kernel: Callable[..., None]) -> None:
+        super().__init__(kfunc)
+        self._compiled_kernel = compiled_kernel
+
+    def __call__(self, **kwargs) -> None:
+        self._compiled_kernel(**kwargs)
+
+    @property
+    def kernel(self) -> Callable[..., None]:
+        return self._compiled_kernel
+
+
+class LegacyCpuJit(JitBase):
+    """Wrapper around ``pystencils.cpu.cpujit``"""
+
+    def compile(self, kernel: KernelFunction) -> KernelWrapper:
+        return compile_and_load(kernel)
+
+
 def make_python_function(kernel_function_node, custom_backend=None):
     """
     Creates C code from the abstract syntax tree, compiles it and makes it accessible as Python function
@@ -449,4 +471,4 @@ def compile_and_load(kernel: KernelFunction, custom_backend=None):
         )
         result = load_kernel_from_file(code_hash_str, kernel.name, lib_file)
 
-    return KernelWrapper(result, kernel.parameters, kernel)
+    return CpuKernelWrapper(kernel, result)
diff --git a/src/pystencils/backend/kernelcreation/analysis.py b/src/pystencils/backend/kernelcreation/analysis.py
index 05aa79928..077684864 100644
--- a/src/pystencils/backend/kernelcreation/analysis.py
+++ b/src/pystencils/backend/kernelcreation/analysis.py
@@ -9,8 +9,8 @@ import sympy as sp
 from .context import KernelCreationContext
 
 from ...field import Field
-from ...assignment import Assignment
 from ...simp import AssignmentCollection
+from sympy.codegen.ast import AssignmentBase
 
 from ..exceptions import PsInternalCompilerError, KernelConstraintsError
 
@@ -43,13 +43,16 @@ class KernelAnalysis:
 
     FieldAndIndex = namedtuple("FieldAndIndex", ["field", "index"])
 
-    def __init__(self, ctx: KernelCreationContext):
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        check_access_independence: bool = True,
+        check_double_writes: bool = True,
+    ):
         self._ctx = ctx
 
-        self._check_access_independence = False
-        self._check_double_writes = (
-            False  # TODO: Access independence check implies double writes check
-        )
+        self._check_access_independence = check_access_independence
+        self._check_double_writes = check_double_writes
 
         #   Map pairs of fields and indices to offsets
         self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict(
@@ -63,7 +66,9 @@ class KernelAnalysis:
 
         self._called = False
 
-    def __call__(self, obj: AssignmentCollection | Sequence[Assignment] | Assignment):
+    def __call__(
+        self, obj: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase
+    ):
         if self._called:
             raise PsInternalCompilerError("KernelAnalysis called twice!")
 
@@ -83,7 +88,7 @@ class KernelAnalysis:
                 for asm in asms:
                     self._visit(asm)
 
-            case Assignment():
+            case AssignmentBase():
                 self._handle_rhs(obj.rhs)
                 self._handle_lhs(obj.lhs)
 
diff --git a/src/pystencils/backend/kernelcreation/ast_factory.py b/src/pystencils/backend/kernelcreation/ast_factory.py
index d5695be93..a0328a123 100644
--- a/src/pystencils/backend/kernelcreation/ast_factory.py
+++ b/src/pystencils/backend/kernelcreation/ast_factory.py
@@ -97,10 +97,14 @@ class AstFactory:
                 return PsExpression.make(PsConstant(idx, self._ctx.index_dtype))
 
     def _parse_any_index(self, idx: Any) -> PsExpression:
-        return self.parse_index(cast(IndexParsable, idx))
+        if not isinstance(idx, _IndexParsable):
+            raise TypeError(f"Cannot parse {idx} as an index expression")
+        return self.parse_index(idx)
 
     def parse_slice(
-        self, slic: slice, upper_limit: Any | None = None
+        self,
+        iter_slice: IndexParsable | slice,
+        normalize_to: IndexParsable | None = None,
     ) -> tuple[PsExpression, PsExpression, PsExpression]:
         """Parse a slice to obtain start, stop and step expressions for a loop or iteration space dimension.
 
@@ -109,30 +113,63 @@ class AstFactory:
         They may also be sympy expressions or integer constants, in which case they are parsed to AST objects
         and must also typify with the kernel creation context's ``index_dtype``.
 
-        If the slice's ``stop`` member is `None` or a negative `int`, `upper_limit` must be specified, which is then
-        used as the upper iteration limit as either ``upper_limit`` or ``upper_limit - stop``.
+        The `step` member of the slice, if it is constant, must be positive.
+
+        The slice may optionally be normalized with respect to an upper iteration limit.
+        If `normalize_to` is specified, negative integers in `iter_slice.start` and `iter_slice.stop` will
+        be added to that normalization limit.
 
         Args:
-            slic: The iteration slice
-            upper_limit: Optionally, the upper iteration limit
+            iter_slice: The iteration slice
+            normalize_to: The upper iteration limit with respect to which the slice should be normalized
         """
 
-        if slic.stop is None or (isinstance(slic.stop, int) and slic.stop < 0):
-            if upper_limit is None:
+        from pystencils.backend.transformations import EliminateConstants
+
+        fold = EliminateConstants(self._ctx)
+
+        start: PsExpression
+        stop: PsExpression | None
+        step: PsExpression
+
+        if not isinstance(iter_slice, slice):
+            start = self.parse_index(iter_slice)
+            stop = fold(
+                self._typify(self.parse_index(iter_slice) + self.parse_index(1))
+            )
+            step = self.parse_index(1)
+        else:
+            start = self._parse_any_index(
+                iter_slice.start if iter_slice.start is not None else 0
+            )
+            stop = (
+                self._parse_any_index(iter_slice.stop)
+                if iter_slice.stop is not None
+                else None
+            )
+            step = self._parse_any_index(
+                iter_slice.step if iter_slice.step is not None else 1
+            )
+
+            if isinstance(step, PsConstantExpr) and step.constant.value <= 0:
                 raise ValueError(
-                    "Must specify an upper iteration limit if `slice.stop` is `None` or a negative `int`"
+                    f"Invalid value for `slice.step`: {step.constant.value}"
                 )
 
-        start = self._parse_any_index(slic.start if slic.start is not None else 0)
-        stop = (
-            self._parse_any_index(slic.stop)
-            if slic.stop is not None
-            else self._parse_any_index(upper_limit)
-        )
-        step = self._parse_any_index(slic.step if slic.step is not None else 1)
+        if normalize_to is not None:
+            upper_limit = self.parse_index(normalize_to)
+            if isinstance(start, PsConstantExpr) and start.constant.value < 0:
+                start = fold(self._typify(upper_limit.clone() + start))
 
-        if isinstance(slic.stop, int) and slic.stop < 0:
-            stop = self._parse_any_index(upper_limit) + stop
+            if stop is None:
+                stop = upper_limit
+            elif isinstance(stop, PsConstantExpr) and stop.constant.value < 0:
+                stop = fold(self._typify(upper_limit.clone() + stop))
+
+        elif stop is None:
+            raise ValueError(
+                "Cannot parse a slice with `stop == None` if no normalization limit is given"
+            )
 
         return start, stop, step
 
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 916274314..464c08dd9 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -239,7 +239,7 @@ class KernelCreationContext:
                 num_entries = field.index_shape[0] if field.index_shape else 1
                 if not isinstance(num_entries, int):
                     raise KernelConstraintsError(
-                        f"Invalid index shape of buffer field {field.name}: {field.spatial_dimensions}. "
+                        f"Invalid index shape of buffer field {field.name}: {num_entries}. "
                         "Buffer fields cannot have variable index shape."
                     )
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index f81ed586b..25ce28115 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -299,7 +299,7 @@ class FreezeExpressions:
 
         if not access.is_absolute_access:
             match field.field_type:
-                case FieldType.GENERIC:
+                case FieldType.GENERIC | FieldType.CUSTOM:
                     #   Add the iteration counters
                     offsets = [
                         PsExpression.make(i) + o
@@ -319,8 +319,6 @@ class FreezeExpressions:
                     compressed_ctr = ispace.compressed_counter()
                     assert len(offsets) == 1
                     offsets = [compressed_ctr + offsets[0]]
-                case FieldType.CUSTOM:
-                    raise ValueError("Custom fields support only absolute accesses.")
                 case unknown:
                     raise NotImplementedError(
                         f"Cannot translate accesses to field type {unknown} yet."
diff --git a/src/pystencils/backend/kernelcreation/iteration_space.py b/src/pystencils/backend/kernelcreation/iteration_space.py
index bfebd5af6..8175fffed 100644
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -121,7 +121,7 @@ class FullIterationSpace(IterationSpace):
     @staticmethod
     def create_from_slice(
         ctx: KernelCreationContext,
-        iteration_slice: slice | Sequence[slice],
+        iteration_slice: int | slice | tuple[int | slice, ...],
         archetype_field: Field | None = None,
     ):
         """Create an iteration space from a sequence of slices, optionally over an archetype field.
@@ -131,7 +131,7 @@ class FullIterationSpace(IterationSpace):
             iteration_slice: The iteration slices for each dimension; for valid formats, see `AstFactory.parse_slice`
             archetype_field: Optionally, an archetype field that dictates the upper slice limits and loop order.
         """
-        if isinstance(iteration_slice, slice):
+        if not isinstance(iteration_slice, tuple):
             iteration_slice = (iteration_slice,)
 
         dim = len(iteration_slice)
@@ -163,7 +163,9 @@ class FullIterationSpace(IterationSpace):
 
         factory = AstFactory(ctx)
 
-        def to_dim(slic: slice, size: PsSymbol | PsConstant | None, ctr: PsSymbol):
+        def to_dim(
+            slic: int | slice, size: PsSymbol | PsConstant | None, ctr: PsSymbol
+        ):
             start, stop, step = factory.parse_slice(slic, size)
             return FullIterationSpace.Dimension(start, stop, step, ctr)
 
@@ -393,7 +395,7 @@ def create_full_iteration_space(
     ctx: KernelCreationContext,
     assignments: AssignmentCollection,
     ghost_layers: None | int | Sequence[int | tuple[int, int]] = None,
-    iteration_slice: None | Sequence[slice] = None,
+    iteration_slice: None | int | slice | tuple[int | slice, ...] = None,
 ) -> IterationSpace:
     assert not ctx.fields.index_fields
 
@@ -443,7 +445,9 @@ def create_full_iteration_space(
         )
     else:
         if len(domain_field_accesses) > 0:
-            inferred_gls = max([fa.required_ghost_layers for fa in domain_field_accesses])
+            inferred_gls = max(
+                [fa.required_ghost_layers for fa in domain_field_accesses]
+            )
         else:
             inferred_gls = 0
 
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 423378482..c89d22788 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -11,7 +11,14 @@ from ..kernelcreation import (
 
 from ..kernelcreation.context import KernelCreationContext
 from ..ast.structural import PsBlock, PsConditional, PsDeclaration
-from ..ast.expressions import PsExpression, PsLiteralExpr, PsCast, PsCall
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCast,
+    PsCall,
+    PsLookup,
+    PsArrayAccess,
+)
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
@@ -154,12 +161,36 @@ class CudaPlatform(GenericGpu):
     ) -> tuple[PsBlock, GpuThreadsRange]:
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
-        ctr = PsExpression.make(ispace.sparse_counter)
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
         thread_idx = self._linear_thread_idx(0)
-        idx_decl = self._typify(PsDeclaration(ctr, PsCast(ctr.get_dtype(), thread_idx)))
-        body.statements = [idx_decl] + body.statements
+        sparse_idx_decl = self._typify(
+            PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
+        )
+
+        mappings = [
+            PsDeclaration(
+                PsExpression.make(ctr),
+                PsLookup(
+                    PsArrayAccess(
+                        ispace.index_list.base_pointer,
+                        sparse_ctr,
+                    ),
+                    coord.name,
+                ),
+            )
+            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
+        ]
+        body.statements = mappings + body.statements
+
+        if not self._cfg.omit_range_check:
+            stop = PsExpression.make(ispace.index_list.shape[0])
+            condition = PsLt(sparse_ctr, stop)
+            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        else:
+            body.statements = [sparse_idx_decl] + body.statements
+            ast = body
 
-        return body, GpuThreadsRange.from_ispace(ispace)
+        return ast, GpuThreadsRange.from_ispace(ispace)
 
     def _linear_thread_idx(self, coord: int):
         block_size = BLOCK_DIM[coord]
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 52953115a..39ec09992 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -5,7 +5,19 @@ from ..kernelcreation.iteration_space import (
     SparseIterationSpace,
 )
 from ..ast.structural import PsDeclaration, PsBlock, PsConditional
-from ..ast.expressions import PsExpression, PsSymbolExpr, PsSubscript, PsLt, PsAnd, PsCall, PsGe, PsLe, PsTernary
+from ..ast.expressions import (
+    PsExpression,
+    PsSymbolExpr,
+    PsSubscript,
+    PsLt,
+    PsAnd,
+    PsCall,
+    PsGe,
+    PsLe,
+    PsTernary,
+    PsLookup,
+    PsArrayAccess
+)
 from ..extensions.cpp import CppMethodCall
 
 from ..kernelcreation.context import KernelCreationContext
@@ -40,7 +52,7 @@ class SyclPlatform(GenericGpu):
 
     def select_function(self, call: PsCall) -> PsExpression:
         assert isinstance(call.function, PsMathFunction)
-        
+
         func = call.function.func
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
@@ -70,13 +82,13 @@ class SyclPlatform(GenericGpu):
 
             call.function = cfunc
             return call
-        
+
         if isinstance(dtype, PsIntegerType):
             match func:
                 case MathFunctions.Abs:
                     zero = PsExpression.make(PsConstant(0, dtype))
                     arg = call.args[0]
-                    return PsTernary(PsGe(arg, zero), arg, - arg)
+                    return PsTernary(PsGe(arg, zero), arg, -arg)
                 case MathFunctions.Min:
                     arg1, arg2 = call.args
                     return PsTernary(PsLe(arg1, arg2), arg1, arg2)
@@ -144,11 +156,33 @@ class SyclPlatform(GenericGpu):
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
         subscript.dtype = ispace.sparse_counter.get_dtype()
 
-        ctr = PsExpression.make(ispace.sparse_counter)
-        unpacking = PsDeclaration(ctr, subscript)
-        body.statements = [unpacking] + body.statements
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        sparse_idx_decl = PsDeclaration(sparse_ctr, subscript)
+
+        mappings = [
+            PsDeclaration(
+                PsExpression.make(ctr),
+                PsLookup(
+                    PsArrayAccess(
+                        ispace.index_list.base_pointer,
+                        sparse_ctr,
+                    ),
+                    coord.name,
+                ),
+            )
+            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
+        ]
+        body.statements = mappings + body.statements
+
+        if not self._cfg.omit_range_check:
+            stop = PsExpression.make(ispace.index_list.shape[0])
+            condition = PsLt(sparse_ctr, stop)
+            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        else:
+            body.statements = [sparse_idx_decl] + body.statements
+            ast = body
 
-        return body, GpuThreadsRange.from_ispace(ispace)
+        return ast, GpuThreadsRange.from_ispace(ispace)
 
     def _item_type(self, rank: int):
         if not self._cfg.sycl_automatic_block_size:
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index 8e9909b31..c8cf489f7 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -10,7 +10,7 @@ from dataclasses import dataclass, InitVar
 from .enums import Target
 from .field import Field, FieldType
 
-from .types import PsIntegerType, UserTypeSpec, PsIeeeFloatType
+from .types import PsIntegerType, UserTypeSpec, PsIeeeFloatType, create_type
 
 from .defaults import DEFAULTS
 
@@ -147,6 +147,9 @@ class GpuIndexingConfig:
     This check can be discarded through this option, at your own peril.
     """
 
+    block_size: tuple[int, int, int] | None = None
+    """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
+
     sycl_automatic_block_size: bool = True
     """If set to `True` while generating for `Target.SYCL`, let the SYCL runtime decide on the block size.
 
@@ -208,7 +211,7 @@ class CreateKernelConfig:
 
     """Data Types"""
 
-    index_dtype: PsIntegerType = DEFAULTS.index_dtype
+    index_dtype: UserTypeSpec = DEFAULTS.index_dtype
     """Data type used for all index calculations."""
 
     default_dtype: UserTypeSpec = PsIeeeFloatType(64)
@@ -217,6 +220,24 @@ class CreateKernelConfig:
     This data type will be applied to all untyped symbols.
     """
 
+    """Analysis"""
+
+    allow_double_writes: bool = False
+    """
+    If True, don't check if every field is only written at a single location. This is required
+    for example for kernels that are compiled with loop step sizes > 1, that handle multiple
+    cells at once. Use with care!
+    """
+
+    skip_independence_check: bool = False
+    """
+    By default the assignment list is checked for read/write independence. This means fields are only written at
+    locations where they are read. Doing so guarantees thread safety. In some cases e.g. for
+    periodicity kernel, this can not be assured and does the check needs to be deactivated. Use with care!
+    """
+
+    """Target-Specific Options"""
+
     cpu_optim: None | CpuOptimConfig = None
     """Configuration of the CPU kernel optimizer.
     
@@ -240,6 +261,46 @@ class CreateKernelConfig:
     cpu_vectorize_info: InitVar[dict | None] = None
     """Deprecated; use `cpu_optim.vectorize` instead."""
 
+    gpu_indexing_params: InitVar[dict | None] = None
+    """Deprecated; use `gpu_indexing` instead."""
+
+    #   Getters
+
+    def get_jit(self) -> JitBase:
+        """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
+        if self.jit is None:
+            if self.target.is_cpu():
+                from .backend.jit import LegacyCpuJit
+
+                return LegacyCpuJit()
+            elif self.target == Target.CUDA:
+                try:
+                    from .backend.jit.gpu_cupy import CupyJit
+
+                    if (
+                        self.gpu_indexing is not None
+                        and self.gpu_indexing.block_size is not None
+                    ):
+                        return CupyJit(self.gpu_indexing.block_size)
+                    else:
+                        return CupyJit()
+
+                except ImportError:
+                    from .backend.jit import no_jit
+
+                    return no_jit
+
+            elif self.target == Target.SYCL:
+                from .backend.jit import no_jit
+
+                return no_jit
+            else:
+                raise NotImplementedError(
+                    f"No default JIT compiler implemented yet for target {self.target}"
+                )
+        else:
+            return self.jit
+
     #   Postprocessing
 
     def __post_init__(self, *args):
@@ -247,6 +308,10 @@ class CreateKernelConfig:
         #   Check deprecated options
         self._check_deprecations(*args)
 
+        #   Check index data type
+        if not isinstance(create_type(self.index_dtype), PsIntegerType):
+            raise PsOptionsError("`index_dtype` was not an integer type.")
+
         #   Check iteration space argument consistency
         if (
             int(self.iteration_slice is not None)
@@ -270,11 +335,6 @@ class CreateKernelConfig:
 
         #   Check optim
         if self.cpu_optim is not None:
-            if not self.target.is_cpu():
-                raise PsOptionsError(
-                    f"`cpu_optim` cannot be set for non-CPU target {self.target}"
-                )
-
             if (
                 self.cpu_optim.vectorize is not False
                 and not self.target.is_vector_cpu()
@@ -284,41 +344,25 @@ class CreateKernelConfig:
                 )
 
         if self.gpu_indexing is not None:
-            if self.target != Target.SYCL:
-                raise PsOptionsError(
-                    f"`gpu_indexing` cannot be set for non-SYCL target {self.target}"
-                )
-
-        #   Infer JIT
-        if self.jit is None:
-            if self.target.is_cpu():
-                from .backend.jit import LegacyCpuJit
-
-                self.jit = LegacyCpuJit()
-            elif self.target == Target.CUDA:
-                try:
-                    from .backend.jit.gpu_cupy import CupyJit
-
-                    self.jit = CupyJit()
-                except ImportError:
-                    from .backend.jit import no_jit
-
-                    self.jit = no_jit
-
-            elif self.target == Target.SYCL:
-                from .backend.jit import no_jit
-
-                self.jit = no_jit
-            else:
-                raise NotImplementedError(
-                    f"No default JIT compiler implemented yet for target {self.target}"
-                )
+            if isinstance(self.gpu_indexing, str):
+                match self.gpu_indexing:
+                    case "block":
+                        self.gpu_indexing = GpuIndexingConfig()
+                    case "line":
+                        raise NotImplementedError(
+                            "GPU line indexing is currently unavailable."
+                        )
+                    case other:
+                        raise PsOptionsError(
+                            f"Invalid value for option gpu_indexing: {other}"
+                        )
 
     def _check_deprecations(
         self,
         data_type: UserTypeSpec | None,
         cpu_openmp: bool | int | None,
         cpu_vectorize_info: dict | None,
+        gpu_indexing_params: dict | None,
     ):
         optim: CpuOptimConfig | None = None
 
@@ -360,6 +404,18 @@ class CreateKernelConfig:
             else:
                 self.cpu_optim = optim
 
+        if gpu_indexing_params is not None:
+            _deprecated_option("gpu_indexing_params", "gpu_indexing")
+
+            if self.gpu_indexing is not None:
+                raise PsOptionsError(
+                    "Cannot specify both `gpu_indexing` and the deprecated `gpu_indexing_params` at the same time."
+                )
+
+            self.gpu_indexing = GpuIndexingConfig(
+                block_size=gpu_indexing_params.get("block_size", None)
+            )
+
 
 def _deprecated_option(name, instead):
     from warnings import warn
diff --git a/src/pystencils/gpu/periodicity.py b/src/pystencils/gpu/periodicity.py
new file mode 100644
index 000000000..6569fbb0f
--- /dev/null
+++ b/src/pystencils/gpu/periodicity.py
@@ -0,0 +1,75 @@
+import numpy as np
+from itertools import product
+
+from pystencils import CreateKernelConfig, create_kernel
+from pystencils import Assignment, Field
+from pystencils.enums import Target
+from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
+
+
+def create_copy_kernel(
+    domain_size,
+    src_slice,
+    dst_slice,
+    index_dimensions=0,
+    index_dim_shape=1,
+    dtype=np.float64,
+):
+    """Copies a rectangular part of an array to another non-overlapping part"""
+
+    field = Field.create_generic(
+        "field", len(domain_size), index_dimensions=index_dimensions, dtype=dtype
+    )
+    normalized_src_slice = normalize_slice(src_slice, field.spatial_shape)
+    normalized_dst_slice = normalize_slice(dst_slice, field.spatial_shape)
+
+    offset = [
+        s1.start - s2.start
+        for s1, s2 in zip(normalized_src_slice, normalized_dst_slice)
+    ]
+    assert offset == [
+        s1.stop - s2.stop for s1, s2 in zip(normalized_src_slice, normalized_dst_slice)
+    ], "Slices have to have same size"
+
+    update_eqs = []
+    if index_dimensions < 2:
+        index_dim_shape = [index_dim_shape]
+    for i in product(*[range(d) for d in index_dim_shape]):
+        eq = Assignment(field(*i), field[tuple(offset)](*i))
+        update_eqs.append(eq)
+
+    config = CreateKernelConfig(
+        target=Target.GPU, iteration_slice=dst_slice, skip_independence_check=True
+    )
+
+    ast = create_kernel(update_eqs, config=config)
+    return ast
+
+
+def get_periodic_boundary_functor(
+    stencil,
+    domain_size,
+    index_dimensions=0,
+    index_dim_shape=1,
+    ghost_layers=1,
+    thickness=None,
+    dtype=np.float64,
+    target=Target.GPU,
+):
+    assert target in {Target.GPU}
+    src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(
+        stencil, ghost_layers, thickness
+    )
+    kernels = []
+
+    for src_slice, dst_slice in src_dst_slice_tuples:
+        ast = create_copy_kernel(
+            domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype
+        )
+        kernels.append(ast.compile())
+
+    def functor(field, **_):
+        for kernel in kernels:
+            kernel(field=field)
+
+    return functor
diff --git a/src/pystencils/kernel_wrapper.py b/src/pystencils/kernel_wrapper.py
index fba94abd5..afce06d77 100644
--- a/src/pystencils/kernel_wrapper.py
+++ b/src/pystencils/kernel_wrapper.py
@@ -1,26 +1,3 @@
-import pystencils
+from .backend.jit import KernelWrapper as _KernelWrapper
 
-
-class KernelWrapper:
-    """
-    Light-weight wrapper around a compiled kernel.
-
-    Can be called while still providing access to underlying AST.
-    """
-
-    def __init__(self, kernel, parameters, ast_node):
-        self.kernel = kernel
-        self.parameters = parameters
-        self.ast = ast_node
-        self.num_regs = None
-
-    def __call__(self, **kwargs):
-        return self.kernel(**kwargs)
-    
-    @property
-    def target(self) -> pystencils.Target:
-        return self.ast.target
-
-    @property
-    def code(self):
-        return pystencils.get_code_str(self.ast)
+KernelWrapper = _KernelWrapper
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 54a2c3585..ae64bdea3 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -1,9 +1,10 @@
-from typing import cast
+from typing import cast, Sequence
+from dataclasses import replace
 
 from .enums import Target
 from .config import CreateKernelConfig
 from .backend import KernelFunction
-from .types import create_numeric_type
+from .types import create_numeric_type, PsIntegerType
 from .backend.ast.structural import PsBlock
 from .backend.kernelcreation import (
     KernelCreationContext,
@@ -28,39 +29,55 @@ from .backend.kernelfunction import (
 )
 
 from .simp import AssignmentCollection
-from .assignment import Assignment
+from sympy.codegen.ast import AssignmentBase
 
 
 __all__ = ["create_kernel"]
 
 
 def create_kernel(
-    assignments: AssignmentCollection | list[Assignment] | Assignment,
-    config: CreateKernelConfig = CreateKernelConfig(),
+    assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    config: CreateKernelConfig | None = None,
+    **kwargs,
 ) -> KernelFunction:
     """Create a kernel function from a set of assignments.
 
     Args:
         assignments: The kernel's sequence of assignments, expressed using SymPy
         config: The configuration for the kernel translator
+        kwargs: If ``config`` is not set, it is created from the keyword arguments;
+            if it is set, its option will be overridden by any keyword arguments.
 
     Returns:
         The numerical kernel in pystencil's internal representation, ready to be
         exported or compiled
     """
 
+    if not config:
+        config = CreateKernelConfig()
+
+    if kwargs:
+        config = replace(config, **kwargs)
+
+    idx_dtype = create_numeric_type(config.index_dtype)
+    assert isinstance(idx_dtype, PsIntegerType)
+
     ctx = KernelCreationContext(
         default_dtype=create_numeric_type(config.default_dtype),
-        index_dtype=config.index_dtype,
+        index_dtype=idx_dtype,
     )
 
-    if isinstance(assignments, Assignment):
+    if isinstance(assignments, AssignmentBase):
         assignments = [assignments]
 
     if not isinstance(assignments, AssignmentCollection):
-        assignments = AssignmentCollection(assignments)
+        assignments = AssignmentCollection(assignments)  # type: ignore
+
+    _ = _parse_simplification_hints(assignments)
 
-    analysis = KernelAnalysis(ctx)
+    analysis = KernelAnalysis(
+        ctx, not config.skip_independence_check, not config.allow_double_writes
+    )
     analysis(assignments)
 
     if len(ctx.fields.index_fields) > 0 or config.index_field is not None:
@@ -132,11 +149,14 @@ def create_kernel(
     select_functions = SelectFunctions(platform)
     kernel_ast = cast(PsBlock, select_functions(kernel_ast))
 
-    assert config.jit is not None
-
     if config.target.is_cpu():
         return create_cpu_kernel_function(
-            ctx, platform, kernel_ast, config.function_name, config.target, config.jit
+            ctx,
+            platform,
+            kernel_ast,
+            config.function_name,
+            config.target,
+            config.get_jit(),
         )
     else:
         return create_gpu_kernel_function(
@@ -146,9 +166,20 @@ def create_kernel(
             gpu_threads,
             config.function_name,
             config.target,
-            config.jit,
+            config.get_jit(),
         )
 
 
-def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs):
-    raise NotImplementedError("Staggered kernels are not yet implemented for pystencils 2.0")
+def create_staggered_kernel(
+    assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs
+):
+    raise NotImplementedError(
+        "Staggered kernels are not yet implemented for pystencils 2.0"
+    )
+
+
+#   Internals
+
+def _parse_simplification_hints(ac: AssignmentCollection):
+    if "split_groups" in ac.simplification_hints:
+        raise NotImplementedError("Loop splitting was requested, but is not implemented yet")
diff --git a/src/pystencils/old/gpu/periodicity.py b/src/pystencils/old/gpu/periodicity.py
deleted file mode 100644
index 21aa95e56..000000000
--- a/src/pystencils/old/gpu/periodicity.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import numpy as np
-from itertools import product
-
-from pystencils import CreateKernelConfig, create_kernel
-from pystencils.gpu import make_python_function
-from pystencils import Assignment, Field
-from pystencils.enums import Target
-from pystencils.slicing import get_periodic_boundary_src_dst_slices, normalize_slice
-
-
-def create_copy_kernel(domain_size, from_slice, to_slice, index_dimensions=0, index_dim_shape=1, dtype=np.float64):
-    """Copies a rectangular part of an array to another non-overlapping part"""
-
-    f = Field.create_generic("pdfs", len(domain_size), index_dimensions=index_dimensions, dtype=dtype)
-    normalized_from_slice = normalize_slice(from_slice, f.spatial_shape)
-    normalized_to_slice = normalize_slice(to_slice, f.spatial_shape)
-
-    offset = [s1.start - s2.start for s1, s2 in zip(normalized_from_slice, normalized_to_slice)]
-    assert offset == [s1.stop - s2.stop for s1, s2 in zip(normalized_from_slice, normalized_to_slice)], \
-        "Slices have to have same size"
-
-    update_eqs = []
-    if index_dimensions < 2:
-        index_dim_shape = [index_dim_shape]
-    for i in product(*[range(d) for d in index_dim_shape]):
-        eq = Assignment(f(*i), f[tuple(offset)](*i))
-        update_eqs.append(eq)
-
-    config = CreateKernelConfig(target=Target.GPU, iteration_slice=to_slice, skip_independence_check=True)
-
-    ast = create_kernel(update_eqs, config=config)
-    return ast
-
-
-def get_periodic_boundary_functor(stencil, domain_size, index_dimensions=0, index_dim_shape=1, ghost_layers=1,
-                                  thickness=None, dtype=np.float64, target=Target.GPU):
-    assert target in {Target.GPU}
-    src_dst_slice_tuples = get_periodic_boundary_src_dst_slices(stencil, ghost_layers, thickness)
-    kernels = []
-
-    for src_slice, dst_slice in src_dst_slice_tuples:
-        ast = create_copy_kernel(domain_size, src_slice, dst_slice, index_dimensions, index_dim_shape, dtype)
-        kernels.append(make_python_function(ast))
-
-    def functor(pdfs, **_):
-        for kernel in kernels:
-            kernel(pdfs=pdfs)
-
-    return functor
diff --git a/src/pystencils/types/parsing.py b/src/pystencils/types/parsing.py
index d6522e5bb..5771eaca8 100644
--- a/src/pystencils/types/parsing.py
+++ b/src/pystencils/types/parsing.py
@@ -95,6 +95,9 @@ def interpret_python_type(t: type) -> PsType:
 def interpret_numpy_dtype(t: np.dtype) -> PsType:
     if t.fields is not None:
         #   it's a struct
+        if not t.isalignedstruct:
+            raise ValueError("pystencils currently only accepts aligned structured data types.")
+
         members = []
         for fname, fspec in t.fields.items():
             members.append(PsStructType.Member(fname, interpret_numpy_dtype(fspec[0])))
diff --git a/tests/test_gpu.py b/tests/nbackend/kernelcreation/test_gpu.py
similarity index 94%
rename from tests/test_gpu.py
rename to tests/nbackend/kernelcreation/test_gpu.py
index 84ef34045..10d222474 100644
--- a/tests/test_gpu.py
+++ b/tests/nbackend/kernelcreation/test_gpu.py
@@ -6,8 +6,8 @@ import sympy as sp
 from scipy.ndimage import convolve
 
 from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
-from pystencils.gpu import BlockIndexing
-from pystencils.sympyextensions import sympy_cse_on_assignment_list
+# from pystencils.gpu import BlockIndexing
+from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice
 
 try:
@@ -74,7 +74,7 @@ def test_multiple_index_dimensions():
     """Sums along the last axis of a numpy array"""
     src_size = (7, 6, 4)
     dst_size = src_size[:2]
-    src_arr = np.asfortranarray(np.random.rand(*src_size))
+    src_arr = np.array(np.random.rand(*src_size))
     dst_arr = np.zeros(dst_size)
 
     src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=1)
@@ -102,6 +102,7 @@ def test_multiple_index_dimensions():
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
+@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_ghost_layer():
     size = (6, 5)
     src_arr = np.ones(size)
@@ -126,6 +127,7 @@ def test_ghost_layer():
     np.testing.assert_equal(reference, dst_arr)
 
 
+@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_setting_value():
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
@@ -156,12 +158,13 @@ def test_periodicity():
     cpu_result = np.copy(arr_cpu)
     periodic_cpu_kernel(cpu_result)
 
-    periodic_gpu_kernel(pdfs=arr_gpu)
+    periodic_gpu_kernel(arr_gpu)
     gpu_result = arr_gpu.get()
     np.testing.assert_equal(cpu_result, gpu_result)
 
 
 @pytest.mark.parametrize("device_number", device_numbers)
+@pytest.mark.xfail(reason="Block indexing specification is not available yet")
 def test_block_indexing(device_number):
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
@@ -194,6 +197,7 @@ def test_block_indexing(device_number):
 @pytest.mark.parametrize('gpu_indexing', ("block", "line"))
 @pytest.mark.parametrize('layout', ("C", "F"))
 @pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
+@pytest.mark.xfail(reason="4D kernels not available yet")
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     n_elements = np.prod(shape)
 
diff --git a/tests/nbackend/kernelcreation/test_index_kernels.py b/tests/nbackend/kernelcreation/test_index_kernels.py
index 817b4a244..5093c43ff 100644
--- a/tests/nbackend/kernelcreation/test_index_kernels.py
+++ b/tests/nbackend/kernelcreation/test_index_kernels.py
@@ -1,16 +1,32 @@
 import numpy as np
+import pytest
 
-from pystencils import Assignment, Field, FieldType, AssignmentCollection
+from pystencils import Assignment, Field, FieldType, AssignmentCollection, Target
 from pystencils.kernelcreation import create_kernel, CreateKernelConfig
 
 
-def test_indexed_kernel():
-    arr = np.zeros((3, 4))
-    dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)])
-    index_arr = np.zeros((3,), dtype=dtype)
-    index_arr[0] = (0, 2, 3.0)
-    index_arr[1] = (1, 3, 42.0)
-    index_arr[2] = (2, 1, 5.0)
+@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
+def test_indexed_kernel(target):
+    if target == Target.GPU:
+        cp = pytest.importorskip("cupy")
+        xp = cp
+    else:
+        xp = np
+
+    arr = xp.zeros((3, 4))
+    dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)], align=True)
+    
+    cpu_index_arr = np.zeros((3,), dtype=dtype)
+    cpu_index_arr[0] = (0, 2, 3.0)
+    cpu_index_arr[1] = (1, 3, 42.0)
+    cpu_index_arr[2] = (2, 1, 5.0)
+
+    if target == Target.GPU:
+        gpu_index_arr = cp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
+        gpu_index_arr.set(cpu_index_arr)
+        index_arr = gpu_index_arr
+    else:
+        index_arr = cpu_index_arr
 
     index_field = Field.create_from_numpy_array('index', index_arr, field_type=FieldType.INDEXED)
     normal_field = Field.create_from_numpy_array('f', arr)
@@ -18,11 +34,14 @@ def test_indexed_kernel():
         Assignment(normal_field[0, 0], index_field('value'))
     ])
 
-    options = CreateKernelConfig(index_field=index_field)
+    options = CreateKernelConfig(index_field=index_field, target=target)
     ast = create_kernel(update_rule, options)
     kernel = ast.compile()
 
     kernel(f=arr, index=index_arr)
 
-    for i in range(index_arr.shape[0]):
-        np.testing.assert_allclose(arr[index_arr[i]['x'], index_arr[i]['y']], index_arr[i]['value'], atol=1e-13)
+    if target == Target.GPU:
+        arr = cp.asnumpy(arr)
+
+    for i in range(cpu_index_arr.shape[0]):
+        np.testing.assert_allclose(arr[cpu_index_arr[i]['x'], cpu_index_arr[i]['y']], cpu_index_arr[i]['value'], atol=1e-13)
diff --git a/tests/nbackend/kernelcreation/test_iteration_space.py b/tests/nbackend/kernelcreation/test_iteration_space.py
index f9646afc2..8ff678fba 100644
--- a/tests/nbackend/kernelcreation/test_iteration_space.py
+++ b/tests/nbackend/kernelcreation/test_iteration_space.py
@@ -1,29 +1,33 @@
 import pytest
+import numpy as np
 
 from pystencils import make_slice, Field, create_type
 from pystencils.sympyextensions.typed_sympy import TypedSymbol
 
 from pystencils.backend.constants import PsConstant
-from pystencils.backend.kernelcreation import KernelCreationContext, FullIterationSpace
+from pystencils.backend.kernelcreation import (
+    KernelCreationContext,
+    FullIterationSpace,
+    AstFactory,
+)
 from pystencils.backend.ast.expressions import PsAdd, PsConstantExpr, PsExpression
 from pystencils.backend.kernelcreation.typification import TypificationError
-from pystencils.types.quick import Int
 
 
-def test_slices():
+def test_slices_over_field():
     ctx = KernelCreationContext()
 
     archetype_field = Field.create_generic("f", spatial_dimensions=3, layout="fzyx")
     ctx.add_field(archetype_field)
 
-    islice = (slice(1, -1, 1), slice(3, -3, 3), slice(0, None, -1))
+    islice = (slice(1, -1, 1), slice(3, -3, 3), slice(0, None, 1))
     ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
 
     archetype_arr = ctx.get_array(archetype_field)
 
     dims = ispace.dimensions
 
-    for sl, size, dim in zip(islice, archetype_arr.shape, dims):
+    for sl, dim in zip(islice, dims):
         assert (
             isinstance(dim.start, PsConstantExpr)
             and dim.start.constant.value == sl.start
@@ -45,6 +49,105 @@ def test_slices():
     assert dims[2].stop.structurally_equal(PsExpression.make(archetype_arr.shape[2]))
 
 
+def test_slices_with_fixed_size_field():
+    ctx = KernelCreationContext()
+
+    archetype_field = Field.create_fixed_size("f", (4, 5, 6), layout="fzyx")
+    ctx.add_field(archetype_field)
+
+    islice = (slice(1, -1, 1), slice(3, -3, 3), slice(0, None, 1))
+    ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    archetype_arr = ctx.get_array(archetype_field)
+
+    dims = ispace.dimensions
+
+    for sl, size, dim in zip(islice, archetype_arr.shape, dims):
+        assert (
+            isinstance(dim.start, PsConstantExpr)
+            and dim.start.constant.value == sl.start
+        )
+
+        assert isinstance(size, PsConstant)
+
+        assert isinstance(
+            dim.stop, PsConstantExpr
+        ) and dim.stop.constant.value == np.int64(
+            size.value + sl.stop if sl.stop is not None else size.value
+        )
+
+        assert (
+            isinstance(dim.step, PsConstantExpr) and dim.step.constant.value == sl.step
+        )
+
+
+def test_singular_slice_over_field():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+
+    archetype_field = Field.create_generic("f", spatial_dimensions=2, layout="fzyx")
+    ctx.add_field(archetype_field)
+    archetype_arr = ctx.get_array(archetype_field)
+
+    islice = (4, -3)
+    ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    dims = ispace.dimensions
+
+    assert dims[0].start.structurally_equal(factory.parse_index(4))
+
+    assert dims[0].stop.structurally_equal(factory.parse_index(5))
+
+    assert dims[1].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-3)
+    )
+
+    assert dims[1].stop.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-2)
+    )
+
+
+def test_slices_with_negative_start():
+    ctx = KernelCreationContext()
+    factory = AstFactory(ctx)
+
+    archetype_field = Field.create_generic("f", spatial_dimensions=2, layout="fzyx")
+    ctx.add_field(archetype_field)
+    archetype_arr = ctx.get_array(archetype_field)
+
+    islice = (slice(-3, -1, 1), slice(-4, None, 1))
+    ispace = FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    dims = ispace.dimensions
+
+    assert dims[0].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[0]) + factory.parse_index(-3)
+    )
+
+    assert dims[1].start.structurally_equal(
+        PsExpression.make(archetype_arr.shape[1]) + factory.parse_index(-4)
+    )
+
+
+def test_field_independent_slices():
+    ctx = KernelCreationContext()
+
+    islice = (slice(-3, -1, 1), slice(-4, 7, 2))
+    ispace = FullIterationSpace.create_from_slice(ctx, islice)
+
+    dims = ispace.dimensions
+
+    for sl, dim in zip(islice, dims):
+        assert isinstance(dim.start, PsConstantExpr)
+        assert dim.start.constant.value == np.int64(sl.start)
+
+        assert isinstance(dim.stop, PsConstantExpr)
+        assert dim.stop.constant.value == np.int64(sl.stop)
+
+        assert isinstance(dim.step, PsConstantExpr)
+        assert dim.step.constant.value == np.int64(sl.step)
+
+
 def test_invalid_slices():
     ctx = KernelCreationContext()
 
@@ -59,6 +162,14 @@ def test_invalid_slices():
     with pytest.raises(TypificationError):
         FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
 
+    islice = (slice(1, 3, 0),)
+    with pytest.raises(ValueError):
+        FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
+    islice = (slice(1, 3, -1),)
+    with pytest.raises(ValueError):
+        FullIterationSpace.create_from_slice(ctx, islice, archetype_field)
+
 
 def test_iteration_count():
     ctx = KernelCreationContext()
@@ -69,16 +180,14 @@ def test_iteration_count():
     three = PsExpression.make(PsConstant(3, ctx.index_dtype))
 
     ispace = FullIterationSpace.create_from_slice(
-        ctx, make_slice[three : i-two, 1:8:3]
+        ctx, make_slice[three : i - two, 1:8:3]
     )
 
     iters = [ispace.actual_iterations(coord) for coord in range(2)]
     assert iters[0].structurally_equal((i - two) - three)
     assert iters[1].structurally_equal(three)
 
-    empty_ispace = FullIterationSpace.create_from_slice(
-        ctx, make_slice[4:4:1, 4:4:7]
-    )
+    empty_ispace = FullIterationSpace.create_from_slice(ctx, make_slice[4:4:1, 4:4:7])
 
     iters = [empty_ispace.actual_iterations(coord) for coord in range(2)]
     assert iters[0].structurally_equal(zero)
diff --git a/tests/nbackend/kernelcreation/test_typification.py b/tests/nbackend/kernelcreation/test_typification.py
index d3da7e888..2ebfa2ec8 100644
--- a/tests/nbackend/kernelcreation/test_typification.py
+++ b/tests/nbackend/kernelcreation/test_typification.py
@@ -141,7 +141,7 @@ def test_lhs_constness():
     with pytest.raises(TypificationError):
         _ = typify(freeze(Assignment(f_const.absolute_access([0], [0]), x + y)))
 
-    np_struct = np.dtype([("size", np.uint32), ("data", np.float32)])
+    np_struct = np.dtype([("size", np.uint32), ("data", np.float32)], align=True)
     struct_type = constify(create_type(np_struct))
     struct_field = Field.create_generic(
         "struct_field", 1, dtype=struct_type, field_type=FieldType.CUSTOM
@@ -167,7 +167,7 @@ def test_typify_structs():
     freeze = FreezeExpressions(ctx)
     typify = Typifier(ctx)
 
-    np_struct = np.dtype([("size", np.uint32), ("data", np.float32)])
+    np_struct = np.dtype([("size", np.uint32), ("data", np.float32)], align=True)
     f = Field.create_generic("f", 1, dtype=np_struct, field_type=FieldType.CUSTOM)
     x = sp.Symbol("x")
 
-- 
GitLab