From 81184635009a1e62d06e8adaa150fa84afd2c026 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 12 Mar 2025 09:50:39 +0100
Subject: [PATCH 01/15] update pyproject.toml to distribute all headers in
 subfolders of `include`

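The previous pattern `include/*.h` only matched headers placed directly inside
`include/`; the recursive `**` globs (supported by recent setuptools releases)
also pick up headers in nested subfolders, plus `.hpp` files. An illustrative
way to check locally which files the patterns cover, assuming the usual
`src/pystencils/include` layout:

    import glob

    # Recursive globs mirroring the package-data patterns below.
    headers = glob.glob("src/pystencils/include/**/*.h", recursive=True)
    headers += glob.glob("src/pystencils/include/**/*.hpp", recursive=True)
    print(sorted(headers))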
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b3c6b1c02..55b21cbbf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,7 +89,8 @@ build-backend = "setuptools.build_meta"
 
 [tool.setuptools.package-data]
 pystencils = [
-    "include/*.h",
+    "include/**/*.h",
+    "include/**/*.hpp",
     "jit/cpu/*.tmpl.cpp",
     "boundaries/createindexlistcython.pyx"
 ]
-- 
GitLab


From 71ab7480164ba7d4aeba69ecd9352898015fd7f9 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 12 Mar 2025 10:27:06 +0100
Subject: [PATCH 02/15] start introduction of Target.HIP

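`Target.HIP` is modeled as a flag combination (`_GPU | _HIP`), analogous to
`Target.CUDA`, so the existing `is_gpu()` checks in the printer and driver
cover both GPU backends. An illustrative sketch of the resulting flag
semantics:

    from pystencils import Target

    # Both GPU targets carry the shared _GPU flag bit but stay distinct.
    assert Target.HIP.is_gpu() and Target.CUDA.is_gpu()
    assert Target.HIP != Target.CUDA
    assert not Target.HIP.is_cpu()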
---
 src/pystencils/backend/emission/base_printer.py | 2 +-
 src/pystencils/codegen/config.py                | 8 +++++---
 src/pystencils/codegen/driver.py                | 4 ++--
 src/pystencils/codegen/target.py                | 7 +++++++
 4 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index cc4b50e21..c76a347f0 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -383,7 +383,7 @@ class BasePrinter(ABC):
         from ...codegen import GpuKernel
         
         sig_parts = [self._func_prefix] if self._func_prefix is not None else []
-        if isinstance(func, GpuKernel) and func.target == Target.CUDA:
+        if isinstance(func, GpuKernel) and func.target.is_gpu():
             sig_parts.append("__global__")
         sig_parts += ["void", func.name, f"({params_str})"]
         signature = " ".join(sig_parts)
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 0d43b40e3..821bb7e07 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -593,12 +593,14 @@ class CreateKernelConfig(ConfigBase):
         """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
         jit: JitBase | None = self.get_option("jit")
 
+        target = self.get_target()
+
         if jit is None:
-            if self.get_target().is_cpu():
+            if target.is_cpu():
                 from ..jit import LegacyCpuJit
 
                 return LegacyCpuJit()
-            elif self.get_target() == Target.CUDA:
+            elif target == Target.CUDA:
                 try:
                     from ..jit.gpu_cupy import CupyJit
                     
@@ -609,7 +611,7 @@ class CreateKernelConfig(ConfigBase):
 
                     return no_jit
 
-            elif self.get_target() == Target.SYCL:
+            elif target == Target.SYCL or target == Target.HIP:
                 from ..jit import no_jit
 
                 return no_jit
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index b8f9c7101..f53f1b9b8 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -398,7 +398,7 @@ class DefaultKernelCreationDriver:
         return kernel_ast
 
     def _get_gpu_indexing(self) -> GpuIndexing | None:
-        if self._target != Target.CUDA:
+        if not self._target.is_gpu():
             return None
 
         from .gpu_indexing import dim3
@@ -441,7 +441,7 @@ class DefaultKernelCreationDriver:
             omit_range_check: bool = gpu_opts.get_option("omit_range_check")
 
             match self._target:
-                case Target.CUDA:
+                case Target.CUDA | Target.HIP:
                     from ..backend.platforms import CudaPlatform
 
                     thread_mapping = (
diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py
index 0d724b877..03364af28 100644
--- a/src/pystencils/codegen/target.py
+++ b/src/pystencils/codegen/target.py
@@ -30,6 +30,7 @@ class Target(Flag):
     _GPU = auto()
 
     _CUDA = auto()
+    _HIP = auto()
 
     _SYCL = auto()
 
@@ -86,6 +87,12 @@ class Target(Flag):
     Generate a CUDA kernel for a generic Nvidia GPU.
     """
 
+    HIP = _GPU | _HIP
+    """Generic HIP GPU target.
+    
+    Generate a HIP kernel for generic AMD or Nvidia GPUs.
+    """
+
     GPU = CUDA
     """Alias for `Target.CUDA`, for backward compatibility."""
 
-- 
GitLab


From 0810cb8c225eddfb7f023c0a5882f6b5a397dc4e Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 12 Mar 2025 12:07:10 +0100
Subject: [PATCH 03/15] fix deprecation warning stacklevel. fix linting.

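The `stacklevel` argument controls which stack frame a warning is attributed
to; `stacklevel=2` is meant to point the deprecation warning at the code
importing `pystencils.enums` rather than at the module itself. Illustrative
sketch of the mechanism for a plain function:

    import warnings

    def deprecated_api():
        # stacklevel=2 reports the caller's line in the warning output,
        # not the line of this warn() call.
        warnings.warn("deprecated_api() is deprecated", FutureWarning, stacklevel=2)

    deprecated_api()  # the emitted FutureWarning points at this line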
---
 src/pystencils/backend/emission/base_printer.py | 1 -
 src/pystencils/enums.py                         | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py
index c76a347f0..c4ac0640c 100644
--- a/src/pystencils/backend/emission/base_printer.py
+++ b/src/pystencils/backend/emission/base_printer.py
@@ -57,7 +57,6 @@ from ..extensions.foreign_ast import PsForeignExpression
 from ..memory import PsSymbol
 from ..constants import PsConstant
 from ...types import PsType
-from ...codegen import Target
 
 if TYPE_CHECKING:
     from ...codegen import Kernel
diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py
index bcea50e84..9d6470ed7 100644
--- a/src/pystencils/enums.py
+++ b/src/pystencils/enums.py
@@ -5,7 +5,8 @@ from warnings import warn
 warn(
     "Importing anything from `pystencils.enums` is deprecated and the module will be removed in pystencils 2.1. "
     "Import from `pystencils` instead.",
-    FutureWarning
+    FutureWarning,
+    stacklevel=2,
 )
 
 Target = _Target
-- 
GitLab


From 17d9fac8b5d76f8491203d30c6c4b09cf4c11790 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Thu, 13 Mar 2025 14:41:08 +0100
Subject: [PATCH 04/15] introduce HipPlatform

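The thread-mapping classes and the shared iteration-space and math-function
lowering move from `CudaPlatform` into the common `GenericGpu` base class, so
the CUDA and HIP platforms now differ only in their `required_headers`.
Illustrative sketch of the resulting class layout and import locations:

    from pystencils.backend.platforms import CudaPlatform, HipPlatform
    from pystencils.backend.platforms.generic_gpu import (
        GenericGpu,
        Linear3DMapping,
        Blockwise4DMapping,
    )

    # Both GPU platforms inherit the shared code generation logic.
    assert issubclass(CudaPlatform, GenericGpu)
    assert issubclass(HipPlatform, GenericGpu)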
---
 src/pystencils/backend/platforms/__init__.py  |   2 +
 src/pystencils/backend/platforms/cuda.py      | 328 +-----------------
 .../backend/platforms/generic_gpu.py          | 326 ++++++++++++++++-
 src/pystencils/backend/platforms/hip.py       |  11 +
 src/pystencils/codegen/driver.py              |  34 +-
 src/pystencils/codegen/gpu_indexing.py        |   4 +-
 6 files changed, 362 insertions(+), 343 deletions(-)
 create mode 100644 src/pystencils/backend/platforms/hip.py

diff --git a/src/pystencils/backend/platforms/__init__.py b/src/pystencils/backend/platforms/__init__.py
index 589841db8..3b602964b 100644
--- a/src/pystencils/backend/platforms/__init__.py
+++ b/src/pystencils/backend/platforms/__init__.py
@@ -2,6 +2,7 @@ from .platform import Platform
 from .generic_cpu import GenericCpu, GenericVectorCpu
 from .generic_gpu import GenericGpu
 from .cuda import CudaPlatform
+from .hip import HipPlatform
 from .x86 import X86VectorCpu, X86VectorArch
 from .sycl import SyclPlatform
 
@@ -13,5 +14,6 @@ __all__ = [
     "X86VectorArch",
     "GenericGpu",
     "CudaPlatform",
+    "HipPlatform",
     "SyclPlatform",
 ]
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index d630594ba..98ff3e3d3 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,335 +1,11 @@
 from __future__ import annotations
-from abc import ABC, abstractmethod
 
-from ...types import constify, deconstify
-from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
-from ..memory import PsSymbol
-from ..kernelcreation import (
-    Typifier,
-    IterationSpace,
-    FullIterationSpace,
-    SparseIterationSpace,
-    AstFactory,
-)
-
-from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration
-from ..ast.expressions import (
-    PsExpression,
-    PsLiteralExpr,
-    PsCast,
-    PsCall,
-    PsLookup,
-    PsBufferAcc,
-)
-from ..ast.expressions import PsLt, PsAnd
-from ...types import PsSignedIntegerType, PsIeeeFloatType
-from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
-
-
-int32 = PsSignedIntegerType(width=32, const=False)
-
-BLOCK_IDX = [
-    PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z")
-]
-THREAD_IDX = [
-    PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z")
-]
-BLOCK_DIM = [
-    PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z")
-]
-GRID_DIM = [
-    PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z")
-]
-
-
-class ThreadMapping(ABC):
-
-    @abstractmethod
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        """Map the current thread index onto a point in the given iteration space.
-
-        Implementations of this method must return a declaration for each dimension counter
-        of the given iteration space.
-        """
-
-
-class Linear3DMapping(ThreadMapping):
-    """3D globally linearized mapping, where each thread is assigned a work item according to
-    its location in the global launch grid."""
-
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        match ispace:
-            case FullIterationSpace():
-                return self._dense_mapping(ispace)
-            case SparseIterationSpace():
-                return self._sparse_mapping(ispace)
-            case _:
-                assert False, "unexpected iteration space"
-
-    def _dense_mapping(
-        self, ispace: FullIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        if ispace.rank > 3:
-            raise MaterializationError(
-                f"Cannot handle {ispace.rank}-dimensional iteration space "
-                "using the Linear3D GPU thread index mapping."
-            )
-
-        dimensions = ispace.dimensions_in_loop_order()
-        idx_map: dict[PsSymbol, PsExpression] = dict()
-
-        for coord, dim in enumerate(dimensions[::-1]):
-            tid = self._linear_thread_idx(coord)
-            idx_map[dim.counter] = dim.start + dim.step * PsCast(
-                deconstify(dim.counter.get_dtype()), tid
-            )
-
-        return idx_map
-
-    def _sparse_mapping(
-        self, ispace: SparseIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
-        idx_map: dict[PsSymbol, PsExpression] = {
-            ispace.sparse_counter: PsCast(
-                deconstify(sparse_ctr.get_dtype()), thread_idx
-            )
-        }
-        return idx_map
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
-
-
-class Blockwise4DMapping(ThreadMapping):
-    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
-    are mapped to block indices."""
-
-    _indices_fastest_first = [  # slowest to fastest
-        THREAD_IDX[0],
-        BLOCK_IDX[0],
-        BLOCK_IDX[1],
-        BLOCK_IDX[2]
-    ]
-
-    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
-        match ispace:
-            case FullIterationSpace():
-                return self._dense_mapping(ispace)
-            case SparseIterationSpace():
-                return self._sparse_mapping(ispace)
-            case _:
-                assert False, "unexpected iteration space"
-
-    def _dense_mapping(
-        self, ispace: FullIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        if ispace.rank > 4:
-            raise MaterializationError(
-                f"Cannot handle {ispace.rank}-dimensional iteration space "
-                "using the Blockwise4D GPU thread index mapping."
-            )
-
-        dimensions = ispace.dimensions_in_loop_order()
-        idx_map: dict[PsSymbol, PsExpression] = dict()
-
-        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
-            idx_map[dim.counter] = dim.start + dim.step * PsCast(
-                deconstify(dim.counter.get_dtype()), tid
-            )
-
-        return idx_map
-
-    def _sparse_mapping(
-        self, ispace: SparseIterationSpace
-    ) -> dict[PsSymbol, PsExpression]:
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._indices_fastest_first[0]
-        idx_map: dict[PsSymbol, PsExpression] = {
-            ispace.sparse_counter: PsCast(
-                deconstify(sparse_ctr.get_dtype()), thread_idx
-            )
-        }
-        return idx_map
-
 
 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs.
-    
-    Args:
-        ctx: The kernel creation context
-        omit_range_check: If `True`, generated index translation code will not check if the point identified
-            by block and thread indices is actually contained in the iteration space
-        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
-    """
-
-    def __init__(
-        self,
-        ctx: KernelCreationContext,
-        omit_range_check: bool = False,
-        thread_mapping: ThreadMapping | None = None,
-    ) -> None:
-        super().__init__(ctx)
-
-        self._omit_range_check = omit_range_check
-        self._thread_mapping = (
-            thread_mapping if thread_mapping is not None else Linear3DMapping()
-        )
-
-        self._typify = Typifier(ctx)
+    """Platform for the CUDA GPU taret."""
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}  # TODO: move to HipPlatform once it is introduced
-
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock:
-        if isinstance(ispace, FullIterationSpace):
-            return self._prepend_dense_translation(body, ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            return self._prepend_sparse_translation(body, ispace)
-        else:
-            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
-
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
-
-        func = call.function.func
-        dtype = call.get_dtype()
-        arg_types = (dtype,) * func.num_args
-
-        if isinstance(dtype, PsIeeeFloatType):
-            match func:
-                case (
-                    MathFunctions.Exp
-                    | MathFunctions.Log
-                    | MathFunctions.Sin
-                    | MathFunctions.Cos
-                    | MathFunctions.Sqrt
-                    | MathFunctions.Ceil
-                    | MathFunctions.Floor
-                ) if dtype.width in (16, 32, 64):
-                    prefix = "h" if dtype.width == 16 else ""
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{prefix}{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Pow
-                    | MathFunctions.Tan
-                    | MathFunctions.Sinh
-                    | MathFunctions.Cosh
-                    | MathFunctions.ASin
-                    | MathFunctions.ACos
-                    | MathFunctions.ATan
-                    | MathFunctions.ATan2
-                ) if dtype.width in (32, 64):
-                    #   These are unavailable for fp16
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case (
-                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
-                ) if dtype.width in (32, 64):
-                    suffix = "f" if dtype.width == 32 else ""
-                    name = f"f{func.function_name}{suffix}"
-                    cfunc = CFunction(name, arg_types, dtype)
-
-                case MathFunctions.Abs if dtype.width == 16:
-                    cfunc = CFunction(" __habs", arg_types, dtype)
-
-                case _:
-                    raise MaterializationError(
-                        f"Cannot materialize call to function {func}"
-                    )
-
-            call.function = cfunc
-            return call
-
-        raise MaterializationError(
-            f"No implementation available for function {func} on data type {dtype}"
-        )
-
-    #   Internals
-
-    def _prepend_dense_translation(
-        self, body: PsBlock, ispace: FullIterationSpace
-    ) -> PsBlock:
-        ctr_mapping = self._thread_mapping(ispace)
-
-        indexing_decls = []
-        conds = []
-
-        dimensions = ispace.dimensions_in_loop_order()
-
-        for dim in dimensions:
-            # counter declarations must be ordered slowest-to-fastest
-            # such that inner dimensions can depend on outer ones
-
-            dim.counter.dtype = constify(dim.counter.get_dtype())
-
-            ctr_expr = PsExpression.make(dim.counter)
-            indexing_decls.append(
-                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
-            )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr_expr, dim.stop))
-
-        if conds:
-            condition: PsExpression = conds[0]
-            for cond in conds[1:]:
-                condition = PsAnd(condition, cond)
-            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-        else:
-            body.statements = indexing_decls + body.statements
-            ast = body
-
-        return ast
-
-    def _prepend_sparse_translation(
-        self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> PsBlock:
-        factory = AstFactory(self._ctx)
-        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
-
-        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
-        ctr_mapping = self._thread_mapping(ispace)
-
-        sparse_idx_decl = self._typify(
-            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
-        )
-
-        mappings = [
-            PsDeclaration(
-                PsExpression.make(ctr),
-                PsLookup(
-                    PsBufferAcc(
-                        ispace.index_list.base_pointer,
-                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
-                    ),
-                    coord.name,
-                ),
-            )
-            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
-        ]
-        body.statements = mappings + body.statements
-
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr_expr.clone(), stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
-
-        return ast
+        return set()
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index b5b35c8b0..fac37ffa5 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,7 +1,331 @@
 from __future__ import annotations
+from abc import ABC, abstractmethod
 
+from ...types import constify, deconstify
+from ..exceptions import MaterializationError
 from .platform import Platform
 
+from ..memory import PsSymbol
+from ..kernelcreation import (
+    Typifier,
+    IterationSpace,
+    FullIterationSpace,
+    SparseIterationSpace,
+    AstFactory,
+)
+
+from ..kernelcreation.context import KernelCreationContext
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCast,
+    PsCall,
+    PsLookup,
+    PsBufferAcc,
+)
+from ..ast.expressions import PsLt, PsAnd
+from ...types import PsSignedIntegerType, PsIeeeFloatType
+from ..literals import PsLiteral
+from ..functions import PsMathFunction, MathFunctions, CFunction
+
+
+int32 = PsSignedIntegerType(width=32, const=False)
+
+BLOCK_IDX = [
+    PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+THREAD_IDX = [
+    PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z")
+]
+BLOCK_DIM = [
+    PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+GRID_DIM = [
+    PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z")
+]
+
+
+class ThreadMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(ThreadMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._linear_thread_idx(0)
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(ThreadMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_fastest_first = [  # slowest to fastest
+        THREAD_IDX[0],
+        BLOCK_IDX[0],
+        BLOCK_IDX[1],
+        BLOCK_IDX[2]
+    ]
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._indices_fastest_first[0]
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
 
 class GenericGpu(Platform):
-    """Base class for GPU platforms."""
+    """Common base platform for CUDA- and HIP-type GPU targets.
+    
+    Args:
+        ctx: The kernel creation context
+        omit_range_check: If `True`, generated index translation code will not check if the point identified
+            by block and thread indices is actually contained in the iteration space
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
+
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        omit_range_check: bool = False,
+        thread_mapping: ThreadMapping | None = None,
+    ) -> None:
+        super().__init__(ctx)
+
+        self._omit_range_check = omit_range_check
+        self._thread_mapping = (
+            thread_mapping if thread_mapping is not None else Linear3DMapping()
+        )
+
+        self._typify = Typifier(ctx)
+
+    def materialize_iteration_space(
+        self, body: PsBlock, ispace: IterationSpace
+    ) -> PsBlock:
+        if isinstance(ispace, FullIterationSpace):
+            return self._prepend_dense_translation(body, ispace)
+        elif isinstance(ispace, SparseIterationSpace):
+            return self._prepend_sparse_translation(body, ispace)
+        else:
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
+
+    def select_function(self, call: PsCall) -> PsExpression:
+        assert isinstance(call.function, PsMathFunction)
+
+        func = call.function.func
+        dtype = call.get_dtype()
+        arg_types = (dtype,) * func.num_args
+
+        if isinstance(dtype, PsIeeeFloatType):
+            match func:
+                case (
+                    MathFunctions.Exp
+                    | MathFunctions.Log
+                    | MathFunctions.Sin
+                    | MathFunctions.Cos
+                    | MathFunctions.Sqrt
+                    | MathFunctions.Ceil
+                    | MathFunctions.Floor
+                ) if dtype.width in (16, 32, 64):
+                    prefix = "h" if dtype.width == 16 else ""
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{prefix}{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Pow
+                    | MathFunctions.Tan
+                    | MathFunctions.Sinh
+                    | MathFunctions.Cosh
+                    | MathFunctions.ASin
+                    | MathFunctions.ACos
+                    | MathFunctions.ATan
+                    | MathFunctions.ATan2
+                ) if dtype.width in (32, 64):
+                    #   These are unavailable for fp16
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case (
+                    MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs
+                ) if dtype.width in (32, 64):
+                    suffix = "f" if dtype.width == 32 else ""
+                    name = f"f{func.function_name}{suffix}"
+                    cfunc = CFunction(name, arg_types, dtype)
+
+                case MathFunctions.Abs if dtype.width == 16:
+                    cfunc = CFunction(" __habs", arg_types, dtype)
+
+                case _:
+                    raise MaterializationError(
+                        f"Cannot materialize call to function {func}"
+                    )
+
+            call.function = cfunc
+            return call
+
+        raise MaterializationError(
+            f"No implementation available for function {func} on data type {dtype}"
+        )
+
+    #   Internals
+
+    def _prepend_dense_translation(
+        self, body: PsBlock, ispace: FullIterationSpace
+    ) -> PsBlock:
+        ctr_mapping = self._thread_mapping(ispace)
+
+        indexing_decls = []
+        conds = []
+
+        dimensions = ispace.dimensions_in_loop_order()
+
+        for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
+            dim.counter.dtype = constify(dim.counter.get_dtype())
+
+            ctr_expr = PsExpression.make(dim.counter)
+            indexing_decls.append(
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
+            )
+            if not self._omit_range_check:
+                conds.append(PsLt(ctr_expr, dim.stop))
+
+        if conds:
+            condition: PsExpression = conds[0]
+            for cond in conds[1:]:
+                condition = PsAnd(condition, cond)
+            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
+        else:
+            body.statements = indexing_decls + body.statements
+            ast = body
+
+        return ast
+
+    def _prepend_sparse_translation(
+        self, body: PsBlock, ispace: SparseIterationSpace
+    ) -> PsBlock:
+        factory = AstFactory(self._ctx)
+        ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
+
+        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+        ctr_mapping = self._thread_mapping(ispace)
+
+        sparse_idx_decl = self._typify(
+            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
+        )
+
+        mappings = [
+            PsDeclaration(
+                PsExpression.make(ctr),
+                PsLookup(
+                    PsBufferAcc(
+                        ispace.index_list.base_pointer,
+                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
+                    ),
+                    coord.name,
+                ),
+            )
+            for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members)
+        ]
+        body.statements = mappings + body.statements
+
+        if not self._omit_range_check:
+            stop = PsExpression.make(ispace.index_list.shape[0])
+            condition = PsLt(sparse_ctr_expr.clone(), stop)
+            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        else:
+            body.statements = [sparse_idx_decl] + body.statements
+            ast = body
+
+        return ast
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
new file mode 100644
index 000000000..c758995a0
--- /dev/null
+++ b/src/pystencils/backend/platforms/hip.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from .generic_gpu import GenericGpu
+
+
+class HipPlatform(GenericGpu):
+    """Platform for the HIP GPU taret."""
+
+    @property
+    def required_headers(self) -> set[str]:
+        return {'"pystencils_runtime/hip.h"'}
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f53f1b9b8..9e3c8b163 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -440,21 +440,27 @@ class DefaultKernelCreationDriver:
             gpu_opts = self._cfg.gpu
             omit_range_check: bool = gpu_opts.get_option("omit_range_check")
 
+            thread_mapping = (
+                self._gpu_indexing.get_thread_mapping()
+                if self._gpu_indexing is not None
+                else None
+            )
+
+            GpuPlatform: type
             match self._target:
-                case Target.CUDA | Target.HIP:
-                    from ..backend.platforms import CudaPlatform
-
-                    thread_mapping = (
-                        self._gpu_indexing.get_thread_mapping()
-                        if self._gpu_indexing is not None
-                        else None
-                    )
-
-                    return CudaPlatform(
-                        self._ctx,
-                        omit_range_check=omit_range_check,
-                        thread_mapping=thread_mapping,
-                    )
+                case Target.CUDA:
+                    from ..backend.platforms import CudaPlatform as GpuPlatform
+                case Target.HIP:
+                    from ..backend.platforms import HipPlatform as GpuPlatform
+                case _:
+                    assert False, f"unexpected GPU target: {self._target}"
+
+            return GpuPlatform(
+                self._ctx,
+                omit_range_check=omit_range_check,
+                thread_mapping=thread_mapping,
+            )
+
         elif self._target == Target.SYCL:
             from ..backend.platforms import SyclPlatform
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 27d6fc817..890afb0fb 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -14,7 +14,7 @@ from ..backend.kernelcreation import (
     FullIterationSpace,
     SparseIterationSpace,
 )
-from ..backend.platforms.cuda import ThreadMapping
+from ..backend.platforms.generic_gpu import ThreadMapping
 
 from ..backend.ast.expressions import PsExpression
 
@@ -236,7 +236,7 @@ class GpuIndexing:
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
 
-        from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
+        from ..backend.platforms.generic_gpu import Linear3DMapping, Blockwise4DMapping
 
         match self._scheme:
             case GpuIndexingScheme.Linear3D:
-- 
GitLab


From de1dc39e761e2fd151c12e87952a897a150674d6 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Mar 2025 16:47:42 +0000
Subject: [PATCH 05/15] start fixing JIT for HIP

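`Target.CurrentGPU` is now resolved through the new `Target.auto_gpu()`
helper, which inspects the installed cupy runtime. Illustrative sketch of the
intended resolution (requires cupy; a RuntimeError is raised otherwise):

    import cupy
    from pystencils import Target

    gpu_target = Target.auto_gpu()
    if cupy.cuda.runtime.is_hip:
        assert gpu_target == Target.HIP   # ROCm/HIP build of cupy
    else:
        assert gpu_target == Target.CUDA  # CUDA build of cupy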
---
 src/pystencils/codegen/config.py        |  6 ++++--
 src/pystencils/codegen/target.py        | 12 ++++++++++++
 src/pystencils/jit/gpu_cupy.py          | 10 +++++++---
 tests/fixtures.py                       |  5 ++++-
 tests/kernelcreation/test_buffer_gpu.py | 24 ++++++++++++------------
 5 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 821bb7e07..a765dea2e 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -586,6 +586,8 @@ class CreateKernelConfig(ConfigBase):
         match t:
             case Target.CurrentCPU:
                 return Target.auto_cpu()
+            case Target.CurrentGPU:
+                return Target.auto_gpu()
             case _:
                 return t
 
@@ -600,7 +602,7 @@ class CreateKernelConfig(ConfigBase):
                 from ..jit import LegacyCpuJit
 
                 return LegacyCpuJit()
-            elif target == Target.CUDA:
+            elif target == Target.CUDA or target == Target.HIP:
                 try:
                     from ..jit.gpu_cupy import CupyJit
                     
@@ -611,7 +613,7 @@ class CreateKernelConfig(ConfigBase):
 
                     return no_jit
 
-            elif target == Target.SYCL or target == Target.HIP:
+            elif target == Target.SYCL:
                 from ..jit import no_jit
 
                 return no_jit
diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py
index 03364af28..c4b08b95c 100644
--- a/src/pystencils/codegen/target.py
+++ b/src/pystencils/codegen/target.py
@@ -126,6 +126,18 @@ class Target(Flag):
         else:
             return Target.GenericCPU
         
+    @staticmethod
+    def auto_gpu() -> Target:
+        try:
+            import cupy
+
+            if cupy.cuda.runtime.is_hip:
+                return Target.HIP
+            else:
+                return Target.CUDA
+        except ImportError:
+            raise RuntimeError("Cannot infer GPU target since cupy is not installed.")
+        
     @staticmethod
     def available_targets() -> list[Target]:
         targets = [Target.GenericCPU]
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index d45abf878..1461bfd7a 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -208,7 +208,11 @@ class CupyKernelWrapper(KernelWrapper):
 class CupyJit(JitBase):
 
     def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
-        self._runtime_headers = {"<cstdint>"}
+        self._runtime_headers: set[str]
+        if cp.cuda.runtime.is_hip:
+            self._runtime_headers = set()
+        else:
+            self._runtime_headers = {"<cstdint>"}
 
         if len(default_block_size) > 3:
             raise ValueError(
@@ -226,9 +230,9 @@ class CupyJit(JitBase):
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
             )
 
-        if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
+        if not isinstance(kernel, GpuKernel):
             raise ValueError(
-                "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
+                "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP"
             )
 
         options = self._compiler_options()
diff --git a/tests/fixtures.py b/tests/fixtures.py
index ba2593f76..a19519988 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -23,7 +23,10 @@ AVAILABLE_TARGETS = [ps.Target.GenericCPU]
 try:
     import cupy
 
-    AVAILABLE_TARGETS += [ps.Target.CUDA]
+    if cupy.cuda.runtime.is_hip:
+        AVAILABLE_TARGETS += [ps.Target.HIP]
+    else:
+        AVAILABLE_TARGETS += [ps.Target.CUDA]
 except ImportError:
     pass
 
diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py
index 0b5019fba..bd9d2156b 100644
--- a/tests/kernelcreation/test_buffer_gpu.py
+++ b/tests/kernelcreation/test_buffer_gpu.py
@@ -58,7 +58,7 @@ def test_full_scalar_field():
 
         pack_eqs = [Assignment(buffer.center(), src_field.center())]
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_ast = create_kernel(pack_eqs, config=config)
 
         pack_kernel = pack_ast.compile()
@@ -66,7 +66,7 @@ def test_full_scalar_field():
 
         unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
 
         unpack_kernel = unpack_ast.compile()
@@ -94,7 +94,7 @@ def test_field_slice():
 
             pack_eqs = [Assignment(buffer.center(), src_field.center())]
 
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             pack_ast = create_kernel(pack_eqs, config=config)
 
             pack_kernel = pack_ast.compile()
@@ -103,7 +103,7 @@ def test_field_slice():
             # Unpack into ghost layer of dst_field in N direction
             unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
 
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             unpack_ast = create_kernel(unpack_eqs, config=config)
 
             unpack_kernel = unpack_ast.compile()
@@ -131,7 +131,7 @@ def test_all_cell_values():
             eq = Assignment(buffer(idx), src_field(idx))
             pack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_code = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_code.compile()
 
@@ -143,7 +143,7 @@ def test_all_cell_values():
             eq = Assignment(dst_field(idx), buffer(idx))
             unpack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_ast.compile()
         unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
@@ -173,7 +173,7 @@ def test_subset_cell_values():
             pack_eqs.append(eq)
 
         pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         pack_ast = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_ast.compile()
         pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
@@ -185,7 +185,7 @@ def test_subset_cell_values():
             unpack_eqs.append(eq)
 
         unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-        config = CreateKernelConfig(target=pystencils.Target.GPU)
+        config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
         unpack_ast = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_ast.compile()
 
@@ -215,7 +215,7 @@ def test_field_layouts():
                 pack_eqs.append(eq)
 
             pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             pack_ast = create_kernel(pack_eqs, config=config)
             pack_kernel = pack_ast.compile()
 
@@ -228,7 +228,7 @@ def test_field_layouts():
                 unpack_eqs.append(eq)
 
             unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
-            config = CreateKernelConfig(target=pystencils.Target.GPU)
+            config = CreateKernelConfig(target=pystencils.Target.CurrentGPU)
             unpack_ast = create_kernel(unpack_eqs, config=config)
             unpack_kernel = unpack_ast.compile()
 
@@ -299,7 +299,7 @@ def test_iteration_slices(gpu_indexing):
         gpu_src_arr.set(src_arr)
         gpu_dst_arr.fill(0)
 
-        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice)
+        config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice)
 
         pack_code = create_kernel(pack_eqs, config=config)
         pack_kernel = pack_code.compile()
@@ -311,7 +311,7 @@ def test_iteration_slices(gpu_indexing):
             eq = Assignment(dst_field(idx), buffer(idx))
             unpack_eqs.append(eq)
 
-        config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice)
+        config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice)
 
         unpack_code = create_kernel(unpack_eqs, config=config)
         unpack_kernel = unpack_code.compile()
-- 
GitLab


From a01997e443d504e40d1d2f882c593b37a3c7a2cd Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Mar 2025 17:46:52 +0000
Subject: [PATCH 06/15] fix remaining test suite

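With `Target.GPU` now aliasing `Target.CurrentGPU`, generic GPU requests are
resolved to CUDA or HIP when the kernel configuration is evaluated, and the
cupy JIT rejects kernels that do not match the installed runtime. Illustrative
sketch of the intended behaviour, assuming cupy is installed:

    import pystencils as ps

    # Resolved via Target.auto_gpu() to Target.CUDA or Target.HIP,
    # depending on whether cupy was built against CUDA or ROCm/HIP.
    cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU)
    assert cfg.get_target().is_gpu()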
---
 src/pystencils/boundaries/boundaryhandling.py |  8 +++----
 src/pystencils/codegen/target.py              | 21 ++++++++++++++++---
 .../datahandling/datahandling_interface.py    |  2 --
 .../datahandling/serial_datahandling.py       |  6 +++---
 src/pystencils/jit/gpu_cupy.py                | 12 ++++++++++-
 tests/fixtures.py                             | 18 +++-------------
 tests/kernelcreation/test_functions.py        | 14 ++++++-------
 tests/kernelcreation/test_gpu.py              | 10 ++++-----
 tests/kernelcreation/test_half_precision.py   |  4 ++--
 tests/kernelcreation/test_index_kernels.py    | 17 +++++----------
 tests/kernelcreation/test_iteration_slices.py |  4 ++--
 tests/runtime/test_boundary.py                |  4 ++--
 tests/runtime/test_datahandling.py            | 12 +++++------
 13 files changed, 67 insertions(+), 65 deletions(-)

diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py
index 1f6e3d126..58340c3e0 100644
--- a/src/pystencils/boundaries/boundaryhandling.py
+++ b/src/pystencils/boundaries/boundaryhandling.py
@@ -123,7 +123,7 @@ class BoundaryHandling:
         class_ = self.IndexFieldBlockData
         class_.to_cpu = to_cpu
         class_.to_gpu = to_gpu
-        gpu = self._target in data_handling._GPU_LIKE_TARGETS
+        gpu = self._target.is_gpu()
         data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu)
 
     @property
@@ -240,7 +240,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 kwargs[self._field_name] = b[self._field_name]
                 kwargs['indexField'] = idx_arr
@@ -255,7 +255,7 @@ class BoundaryHandling:
         if self._dirty:
             self.prepare()
 
-        for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS):
+        for b in self._data_handling.iterate(gpu=self._target.is_gpu()):
             for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
                 arguments = kwargs.copy()
                 arguments[self._field_name] = b[self._field_name]
@@ -341,7 +341,7 @@ class BoundaryHandling:
     def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs):
         if boundary_obj.additional_data_init_callback:
             boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs)
-        if self._target in self._data_handling._GPU_LIKE_TARGETS:
+        if self._target.is_gpu():
             self._data_handling.to_gpu(self._index_array_name)
 
     class BoundaryInfo(object):
diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py
index c4b08b95c..5e214430c 100644
--- a/src/pystencils/codegen/target.py
+++ b/src/pystencils/codegen/target.py
@@ -93,8 +93,8 @@ class Target(Flag):
     Generate a HIP kernel for generic AMD or Nvidia GPUs.
     """
 
-    GPU = CUDA
-    """Alias for `Target.CUDA`, for backward compatibility."""
+    GPU = CurrentGPU
+    """Alias for `Target.CurrentGPU`, for backward compatibility."""
 
     SYCL = _SYCL
     """SYCL kernel target.
@@ -106,15 +106,24 @@ class Target(Flag):
     """
 
     def is_automatic(self) -> bool:
+        """Determine if this target is a proxy target that is automatically resolved
+        according to the runtime environment."""
         return Target._AUTOMATIC in self
 
     def is_cpu(self) -> bool:
+        """Determine if this target is a CPU target."""
         return Target._CPU in self
 
     def is_vector_cpu(self) -> bool:
+        """Determine if this target is a vector CPU target."""
         return self.is_cpu() and Target._VECTOR in self
 
     def is_gpu(self) -> bool:
+        """Determine if this target is a GPU target.
+        
+        This refers to targets for the CUDA and HIP family of platforms.
+        `Target.SYCL` is *not* a GPU target.
+        """
         return Target._GPU in self
 
     @staticmethod
@@ -128,6 +137,11 @@ class Target(Flag):
         
     @staticmethod
     def auto_gpu() -> Target:
+        """Return the GPU target available in the current runtime environment.
+        
+        Raises:
+            RuntimeError: If `cupy` is not installed and therefore no GPU runtime is available.
+        """
         try:
             import cupy
 
@@ -140,10 +154,11 @@ class Target(Flag):
         
     @staticmethod
     def available_targets() -> list[Target]:
+        """List available"""
         targets = [Target.GenericCPU]
         try:
             import cupy  # noqa: F401
-            targets.append(Target.CUDA)
+            targets.append(Target.auto_gpu())
         except ImportError:
             pass
 
diff --git a/src/pystencils/datahandling/datahandling_interface.py b/src/pystencils/datahandling/datahandling_interface.py
index 867bbf062..a6b1fcb55 100644
--- a/src/pystencils/datahandling/datahandling_interface.py
+++ b/src/pystencils/datahandling/datahandling_interface.py
@@ -17,8 +17,6 @@ class DataHandling(ABC):
     'gather' function that has collects (parts of the) distributed data on a single process.
     """
 
-    _GPU_LIKE_TARGETS = [Target.GPU]
-
     # ---------------------------- Adding and accessing data -----------------------------------------------------------
     @property
     @abstractmethod
diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py
index 73b749ca4..dc6904c3a 100644
--- a/src/pystencils/datahandling/serial_datahandling.py
+++ b/src/pystencils/datahandling/serial_datahandling.py
@@ -110,7 +110,7 @@ class SerialDataHandling(DataHandling):
         if layout is None:
             layout = self.default_layout
         if gpu is None:
-            gpu = self.default_target in self._GPU_LIKE_TARGETS
+            gpu = self.default_target.is_gpu()
 
         kwargs = {
             'shape': tuple(s + 2 * ghost_layers for s in self._domainSize),
@@ -241,7 +241,7 @@ class SerialDataHandling(DataHandling):
 
     def swap(self, name1, name2, gpu=None):
         if gpu is None:
-            gpu = self.default_target in self._GPU_LIKE_TARGETS
+            gpu = self.default_target.is_gpu()
         arr = self.gpu_arrays if gpu else self.cpu_arrays
         arr[name1], arr[name2] = arr[name2], arr[name1]
 
@@ -292,7 +292,7 @@ class SerialDataHandling(DataHandling):
         if target is None:
             target = self.default_target
             
-        if not (target.is_cpu() or target == Target.CUDA):
+        if not (target.is_cpu() or target.is_gpu()):
             raise ValueError(f"Unsupported target: {target}")
 
         if not hasattr(names, '__len__') or type(names) is str:
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 1461bfd7a..69e965325 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -231,9 +231,19 @@ class CupyJit(JitBase):
             )
 
         if not isinstance(kernel, GpuKernel):
-            raise ValueError(
+            raise JitError(
                 "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP"
             )
+        
+        if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip:
+            raise JitError(
+                "Cannot compile a CUDA kernel on a HIP-based Cupy installation."
+            )
+        
+        if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip:
+            raise JitError(
+                "Cannot compile a HIP kernel on a CUDA-based Cupy installation."
+            )
 
         options = self._compiler_options()
         prelude = self._prelude(kernel)
diff --git a/tests/fixtures.py b/tests/fixtures.py
index a19519988..a4c77f550 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -18,19 +18,7 @@ from types import ModuleType
 
 import pystencils as ps
 
-AVAILABLE_TARGETS = [ps.Target.GenericCPU]
-
-try:
-    import cupy
-
-    if cupy.cuda.runtime.is_hip:
-        AVAILABLE_TARGETS += [ps.Target.HIP]
-    else:
-        AVAILABLE_TARGETS += [ps.Target.CUDA]
-except ImportError:
-    pass
-
-AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets()
+AVAILABLE_TARGETS = ps.Target.available_targets()
 TARGET_IDS = [t.name for t in AVAILABLE_TARGETS]
 
 
@@ -75,9 +63,9 @@ def xp(target: ps.Target) -> ModuleType:
     """Primary array module for the current target.
 
     Returns:
-        `cupy` if `target == Target.CUDA`, and `numpy` otherwise
+        `cupy` if `target.is_gpu()`, and `numpy` otherwise
     """
-    if target == ps.Target.CUDA:
+    if target.is_gpu():
         import cupy as xp
 
         return xp
diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py
index a4d154d4b..182a59005 100644
--- a/tests/kernelcreation/test_functions.py
+++ b/tests/kernelcreation/test_functions.py
@@ -106,14 +106,14 @@ def function_domain(function_name, dtype):
         case "pow":
             return np.concatenate(
                 [
-                    [0., 1., 1.],
-                    rng.uniform(-1., 1., 8),
-                    rng.uniform(0., 5., 8),
+                    [0.0, 1.0, 1.0],
+                    rng.uniform(-1.0, 1.0, 8),
+                    rng.uniform(0.0, 5.0, 8),
                 ]
             ).astype(dtype), np.concatenate(
                 [
-                    [1., 0., 2.],
-                    np.arange(2., 10., 1.),
+                    [1.0, 0.0, 2.0],
+                    np.arange(2.0, 10.0, 1.0),
                     rng.uniform(-2.0, 2.0, 8),
                 ]
             ).astype(
@@ -211,14 +211,14 @@ def test_binary_functions(gen_config, xp, function_name, dtype, function_domain)
 
 dtype_and_target_for_integer_funcs = pytest.mark.parametrize(
     "dtype, target",
-    list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA]))
+    list(product([np.int32], [t for t in AVAIL_TARGETS if not t.is_gpu()]))
     + list(
         product(
             [np.int64],
             [
                 t
                 for t in AVAIL_TARGETS
-                if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA)
+                if t not in (Target.X86_SSE, Target.X86_AVX) and not t.is_gpu()
             ],
         )
     ),
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index f1905b1fc..8d943e8fd 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -45,7 +45,7 @@ def test_indexing_options_3d(
         + src[0, 0, 1],
     )
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
@@ -91,7 +91,7 @@ def test_indexing_options_2d(
         + src[0, 1]
     )
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
     cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
@@ -126,7 +126,7 @@ def test_invalid_indexing_schemes():
     src, dst = fields("src, dst: [4D]")
     asm = Assignment(src.center(0), dst.center(0))
 
-    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = "linear3d"
 
     with pytest.raises(Exception):
@@ -241,7 +241,7 @@ def test_ghost_layer():
     ghost_layers = [(1, 2), (2, 1)]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.ghost_layers = ghost_layers
     config.gpu.indexing_scheme = "blockwise4d"
 
@@ -270,7 +270,7 @@ def test_setting_value():
     update_rule = [Assignment(f(0), sp.Symbol("value"))]
 
     config = CreateKernelConfig()
-    config.target = Target.CUDA
+    config.target = Target.CurrentGPU
     config.iteration_slice = iteration_slice
     config.gpu.indexing_scheme = "blockwise4d"
 
diff --git a/tests/kernelcreation/test_half_precision.py b/tests/kernelcreation/test_half_precision.py
index a9745459d..5dbe2180e 100644
--- a/tests/kernelcreation/test_half_precision.py
+++ b/tests/kernelcreation/test_half_precision.py
@@ -5,7 +5,7 @@ import numpy as np
 import pystencils as ps
 
 
-@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
+@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.CurrentGPU))
 def test_half_precison(target):
     if target == ps.Target.CPU:
         if not platform.machine() in ['arm64', 'aarch64']:
@@ -14,7 +14,7 @@ def test_half_precison(target):
         if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']:
             pytest.xfail("skipping half precision because clang compiler is not used")
 
-    if target == ps.Target.GPU:
+    if target.is_gpu():
         pytest.importorskip("cupy")
 
     dh = ps.create_data_handling(domain_size=(10, 10), default_target=target)
diff --git a/tests/kernelcreation/test_index_kernels.py b/tests/kernelcreation/test_index_kernels.py
index 569c0ab6a..bda0ef273 100644
--- a/tests/kernelcreation/test_index_kernels.py
+++ b/tests/kernelcreation/test_index_kernels.py
@@ -5,14 +5,7 @@ from pystencils import Assignment, Field, FieldType, AssignmentCollection, Targe
 from pystencils import create_kernel, CreateKernelConfig
 
 
-@pytest.mark.parametrize("target", [Target.CPU, Target.GPU])
-def test_indexed_kernel(target):
-    if target == Target.GPU:
-        cp = pytest.importorskip("cupy")
-        xp = cp
-    else:
-        xp = np
-
+def test_indexed_kernel(target, xp):
     arr = xp.zeros((3, 4))
     dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)], align=True)
     
@@ -21,8 +14,8 @@ def test_indexed_kernel(target):
     cpu_index_arr[1] = (1, 3, 42.0)
     cpu_index_arr[2] = (2, 1, 5.0)
 
-    if target == Target.GPU:
-        gpu_index_arr = cp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
+    if target.is_gpu():
+        gpu_index_arr = xp.empty(cpu_index_arr.shape, cpu_index_arr.dtype)
         gpu_index_arr.set(cpu_index_arr)
         index_arr = gpu_index_arr
     else:
@@ -40,8 +33,8 @@ def test_indexed_kernel(target):
 
     kernel(f=arr, index=index_arr)
 
-    if target == Target.GPU:
-        arr = cp.asnumpy(arr)
+    if target.is_gpu():
+        arr = xp.asnumpy(arr)
 
     for i in range(cpu_index_arr.shape[0]):
         np.testing.assert_allclose(arr[cpu_index_arr[i]['x'], cpu_index_arr[i]['y']], cpu_index_arr[i]['value'], atol=1e-13)
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index b1f2da576..2b3a8ebf0 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -144,7 +144,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     kernel = create_kernel(update, gen_config).compile()
@@ -177,7 +177,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
     islice = make_slice[:, start::2]
     gen_config.iteration_slice = islice
 
-    if gen_config.target == Target.CUDA:
+    if gen_config.target.is_gpu():
         gen_config.gpu.manual_launch_grid = True
 
     try:
diff --git a/tests/runtime/test_boundary.py b/tests/runtime/test_boundary.py
index fb8f827e8..226510b83 100644
--- a/tests/runtime/test_boundary.py
+++ b/tests/runtime/test_boundary.py
@@ -98,7 +98,7 @@ def test_kernel_vs_copy_boundary():
 
 def test_boundary_gpu():
     pytest.importorskip('cupy')
-    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.GPU)
+    dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.CurrentGPU)
     src = dh.add_array('src')
     dh.fill("src", 0.0, ghost_layers=True)
     dh.fill("src", 1.0, ghost_layers=False)
@@ -111,7 +111,7 @@ def test_boundary_gpu():
                                              name="boundary_handling_cpu", target=Target.CPU)
 
     boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
-                                         name="boundary_handling_gpu", target=Target.GPU)
+                                         name="boundary_handling_gpu", target=Target.CurrentGPU)
 
     neumann = Neumann()
     for d in ('N', 'S', 'W', 'E'):
diff --git a/tests/runtime/test_datahandling.py b/tests/runtime/test_datahandling.py
index 9d7ff924e..9e7c73cac 100644
--- a/tests/runtime/test_datahandling.py
+++ b/tests/runtime/test_datahandling.py
@@ -118,7 +118,7 @@ def synchronization(dh, test_gpu=False):
 
 def kernel_execution_jacobi(dh, target):
 
-    test_gpu = target == Target.GPU
+    test_gpu = target == Target.CurrentGPU
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
 
@@ -219,15 +219,15 @@ def test_kernel():
         try:
             import cupy
             dh = create_data_handling(domain_size=domain_shape, periodicity=True)
-            kernel_execution_jacobi(dh, Target.GPU)
+            kernel_execution_jacobi(dh, Target.CurrentGPU)
         except ImportError:
             pass
 
 
-@pytest.mark.parametrize('target', (Target.CPU, Target.GPU))
+@pytest.mark.parametrize('target', (Target.CPU, Target.CurrentGPU))
 def test_kernel_param(target):
     for domain_shape in [(4, 5), (3, 4, 5)]:
-        if target == Target.GPU:
+        if target == Target.CurrentGPU:
             pytest.importorskip('cupy')
 
         dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target)
@@ -262,7 +262,7 @@ def test_add_arrays():
 def test_add_arrays_with_layout(shape, layout):
     pytest.importorskip('cupy')
 
-    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU)
+    dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.CurrentGPU)
     f1 = dh.add_array("f1", values_per_cell=19)
     dh.fill(f1.name, 1.0)
 
@@ -392,8 +392,6 @@ def test_array_handler(device_number):
     empty = array_handler.empty(shape=size, order="F")
     assert empty.strides == (8, 16)
 
-    random_array = array_handler.randn(size)
-
     cpu_array = np.empty((20, 40), dtype=np.float64)
     gpu_array = array_handler.to_gpu(cpu_array)
 
-- 
GitLab


From d5de40476583e59c40246ac843d7a0f161585920 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Mar 2025 18:01:28 +0000
Subject: [PATCH 07/15] start updating docs

---
 docs/source/installation.md            | 16 +++++++--
 docs/source/user_manual/gpu_kernels.md | 48 ++++++++++++++++----------
 2 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/docs/source/installation.md b/docs/source/installation.md
index deb2b0613..8c344e760 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -48,10 +48,22 @@ to build this documentation, and `tests`, which adds `flake8` for code style che
 For more information on developing pystencils, see the [](#contribution_guide).
 :::
 
-### For Nvidia GPUs
+### For GPUs
 
 If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile
 and execute kernels running on your GPU.
-This requires a working installation of [cupy](https://cupy.dev).
+This requires a working installation of [Cupy](https://cupy.dev).
 Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
 for details about installing cupy.
+
+You can also use Cupy together with AMD ROCm for AMD graphics cards,
+but the setup steps are a bit more complicated - you might have to build cupy from source.
+The Cupy documentation covers this in their [installation guide for Cupy on ROCm][cupy-rocm].
+
+:::{note}
+Since Cupy's support for ROCm is at this time still an experimental feature,
+just-in-time compilation of pystencils HIP kernels
+for the ROCm platform must also be considered *experimental*.
+:::
+
+[cupy-rocm]: https://docs.cupy.dev/en/stable/install.html#using-cupy-on-amd-gpu-experimental "Cupy on ROCm"
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 610c61ddf..2219ce042 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -26,23 +26,46 @@ import matplotlib.pyplot as plt
 (guide_gpukernels)=
 # Pystencils for GPUs
 
-Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
+Pystencils offers code generation for Nvidia and AMD GPUs
+using the CUDA and HIP programming models,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
 based on the [cupy] library.
 This section's objective is to give a detailed introduction into the creation of
 GPU kernels with pystencils.
 
-## Generate, Compile and Run CUDA Kernels
+:::{note}
+[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
+which operates much in the same way that [NumPy][numpy] works on CPU arrays.
+Cupy and NumPy expose nearly the same APIs for array operations;
+the difference being that CuPy allocates all its arrays on the GPU
+and performs its operations as CUDA kernels.
+Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
+In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
+and to allocate and manage the data these kernels can be executed on.
+
+For more information on CuPy, refer to [their documentation][cupy-docs].
+:::
+
+## Generate, Compile and Run GPU Kernels
+
+The CUDA and HIP platforms are made available in pystencils via the code generation targets
+{any}`Target.CUDA` and {any}`Target.HIP`.
+For pystencils code to be portable between both, we can use {any}`Target.CurrentGPU` to
+automatically select one or the other, depending on the current runtime environment.
+
+:::{note}
+If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`.
+When exporting kernels to be compiled externally in an environment where cupy is not available,
+the GPU target must therefore be set explicitly.
+:::
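+
+For illustration, here is a minimal sketch of generating code for an explicitly chosen GPU target;
+inspecting the generated kernel does not require Cupy (the choice of {any}`Target.HIP` here is arbitrary):
+
+```{code-cell} ipython3
+f, g = ps.fields("f, g: float64[3D]")
+update = ps.Assignment(f.center(), 2 * g.center())
+
+# Explicit target selection; no Cupy is needed as long as the kernel is not compiled
+cfg_export = ps.CreateKernelConfig(target=ps.Target.HIP)
+kernel_export = ps.create_kernel(update, cfg_export)
+
+ps.inspect(kernel_export)
+```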
 
-In order to obtain a CUDA implementation of a symbolic kernel, naught more is required
-than setting the {any}`target <CreateKernelConfig.target>` code generator option to
-{any}`Target.CUDA`:
+Here is a snippet creating a kernel for the locally available GPU target:
 
 ```{code-cell} ipython3
 f, g = ps.fields("f, g: float64[3D]")
 update = ps.Assignment(f.center(), 2 * g.center())
 
-cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU)
 kernel = ps.create_kernel(update, cfg)
 
 ps.inspect(kernel)
@@ -68,19 +91,6 @@ kfunc = kernel.compile()
 kfunc(f=f_arr, g=g_arr)
 ```
 
-:::{note}
-[CuPy][cupy] is a Python library for numerical computations on GPU arrays,
-which operates much in the same way that [NumPy][numpy] works on CPU arrays.
-Cupy and NumPy expose nearly the same APIs for array operations;
-the difference being that CuPy allocates all its arrays on the GPU
-and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
-In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
-and to allocate and manage the data these kernels can be executed on.
-
-For more information on CuPy, refer to [their documentation][cupy-docs].
-:::
-
 (indexing_and_launch_config)=
 ## Modify the Indexing Scheme and Launch Configuration
 
-- 
GitLab


From 3d44a19e67699cc9f0b0d73426d7e76de985a742 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 14 Mar 2025 18:41:36 +0000
Subject: [PATCH 08/15] Update docs, installation + contrib guide

---
 docs/source/backend/gpu_codegen.md       | 14 +++-----
 docs/source/backend/platforms.md         | 15 ++++++--
 docs/source/contributing/dev-workflow.md | 46 +++++++++++++++++++++---
 docs/source/installation.md              | 21 ++---------
 docs/source/user_manual/gpu_kernels.md   |  6 ++--
 pyproject.toml                           |  3 +-
 6 files changed, 67 insertions(+), 38 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 1082669e6..0415c12c6 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -2,23 +2,19 @@
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
 
- - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration
-   space by mapping GPU block and thread indices to iteration space points. To perform this task,
-   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+ - The platforms {any}`CudaPlatform` and {any}`HipPlatform` at `backend.platforms`
+   which perform materialization of a kernel's iteration space
+   by mapping GPU block and thread indices to iteration space points.
+   To perform this task, they depend on a {any}`ThreadMapping` instance which defines the nature of that mapping.
    The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
  - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class.
    It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
    for the runtime system.
 
-:::{attention}
-
-Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment.
-:::
-
 ## The CUDA Platform and Thread Mappings
 
 ```{eval-rst}
-.. module:: pystencils.backend.platforms.cuda
+.. module:: pystencils.backend.platforms.generic_gpu
 
 .. autosummary::
     :toctree: generated
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
index e7ffc6f15..2d2c33d86 100644
--- a/docs/source/backend/platforms.md
+++ b/docs/source/backend/platforms.md
@@ -26,7 +26,6 @@ targets in the future.
     Platform
     GenericCpu
     GenericVectorCpu
-    GenericGpu
 ```
 
 ## CPU Platforms
@@ -49,6 +48,18 @@ targets in the future.
     :nosignatures:
     :template: autosummary/entire_class.rst
 
+    GenericGpu
     CudaPlatform
-    SyclPlatform
+    HipPlatform
 ```
+
+## Experimental Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    SyclPlatform
+```
\ No newline at end of file
diff --git a/docs/source/contributing/dev-workflow.md b/docs/source/contributing/dev-workflow.md
index 8daac8cbd..d92916130 100644
--- a/docs/source/contributing/dev-workflow.md
+++ b/docs/source/contributing/dev-workflow.md
@@ -48,16 +48,22 @@ git pull --set-upstream upstream master
 
 ## Set Up the Python Environment
 
+### Prerequisites
+
 To develop pystencils, you will need at least the following software installed on your machine:
 
 - Python 3.10 or later: Since pystencils minimal supported version is Python 3.10, we recommend that you work with Python 3.10 directly.
 - An up-to-date C++ compiler, used by pystencils to JIT-compile generated code
 - [Nox](https://nox.thea.codes/en/stable/), which we use for test automation.
   Nox will be used extensively in the instructions on testing below.
-- Optionally [CUDA](https://developer.nvidia.com/cuda-toolkit),
-  if you have an Nvidia or AMD GPU and plan to develop on pystencils' GPU capabilities
+- Optionally, for GPU development:
+  - At least CUDA 11 for Nvidia GPUs, or
+  - At least ROCm/HIP 6.1 for AMD GPUs.
+
+### Virtual Environment Setup
 
-Once you have these, set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
+Once you have all the prerequisites,
+set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development.
 This ensures that your system's installation of Python is kept clean, and isolates your development environment
 from outside influence.
 Use the following commands to create a virtual environment at `.venv` and perform an editable install of pystencils into it:
@@ -74,7 +80,39 @@ Setting `PIP_REQUIRE_VIRTUALENV` ensures that pip refuses to install packages gl
 Consider setting this variable globally in your shell's configuration file.
 :::
 
-You are now ready to go! Create a new git branch to work on, open up an IDE, and start coding.
+:::{admonition} Feature Groups
+The above installation instructions assume that you will be running all code checking
+and test tasks through `nox`.
+If you need or want to run them manually, you will need to add one or more
+of these feature groups to your installation:
+
+ - `doc`, which contains all dependencies required to build this documentation;
+ - `dev`, which adds `flake8` for code style checking,
+   `mypy` for static type checking,
+    and the `black` formatter;
+ - `testsuite`, which adds `pytest` plus plugins and some more dependencies required
+   for running the test suite.
+
+Depending on your development focus, you might also need to add some of the user feature
+groups listed in [the installation guide](#installation_guide).
+:::
+
+### Cupy for CUDA and HIP
+
+When developing for Nvidia or AMD GPUs, you will likely need an installation of [cupy](https://cupy.dev/).
+Since cupy has to be built specifically against the libraries of a given CUDA or ROCm version,
+it cannot be installed directly via dependency resolution from pystencils.
+For instructions on how to install Cupy, refer to their [installation manual](https://docs.cupy.dev/en/stable/install.html).
+
+### Test Your Setup
+
+To check that your setup is complete, invoke the pystencils test suite:
+
+```bash
+nox -s "testsuite(cpu)"
+```
+
+If this finishes without errors, you are ready to go! Create a new git branch to work on, open up an IDE, and start coding.
 Make sure your IDE recognizes the virtual environment you created, though.
 
 ## Static Code Analysis
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 8c344e760..5cb274c93 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,4 +1,4 @@
-(_installation)=
+(installation_guide)=
 # Setup and Installation
 
 ## Install pystencils
@@ -17,7 +17,7 @@ git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git
 pip install -e pystencils
 ```
 
-### Feature Groups
+## Feature Groups
 
 In both cases, you can add a set of optional features to your installation by listing them
 in square brackets (e.g. `pip install -e pystencils[feature1, feature2]`).
@@ -33,22 +33,7 @@ The following feature sets are available:
 - `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils
   to accelerate the setup of boundary conditions.
 
-:::{dropdown} For Developers
-
-If you are developing pystencils, we recommend you perform an editable install of your
-local clone of the repository, with all optional features:
-```bash
-pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite]
-```
-
-This includes the additional feature groups `doc`, which contains all dependencies required
-to build this documentation, and `tests`, which adds `flake8` for code style checking,
-`mypy` for static type checking, and `pytest` plus plugins for running the test suite.
-
-For more information on developing pystencils, see the [](#contribution_guide).
-:::
-
-### For GPUs
+## For GPUs
 
 If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile
 and execute kernels running on your GPU.
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 2219ce042..14a29c41c 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -55,8 +55,8 @@ automatically select one or the other, depending on the current runtime environm
 
 :::{note}
 If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`.
-When exporting kernels to be compiled externally in an environment where cupy is not available,
-the GPU target must therefore be set explicitly.
+You can still generate kernels for CUDA or HIP directly even without Cupy;
+you just won't be able to just-in-time compile and run them.
 :::
 
 Here is a snippet creating a kernel for the locally available GPU target:
@@ -218,7 +218,7 @@ assignments = [
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
 cfg = ps.CreateKernelConfig()
-cfg.target= ps.Target.CUDA
+cfg.target= ps.Target.CurrentGPU
 cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 55b21cbbf..ae539b12c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,6 @@ classifiers = [
 "Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils"
 
 [project.optional-dependencies]
-gpu = ['cupy']
 alltrafos = ['islpy', 'py-cpuinfo']
 bench_db = ['blitzdb', 'pymongo', 'pandas']
 interactive = [
@@ -76,7 +75,7 @@ testsuite = [
     'matplotlib',
     'py-cpuinfo',
     'randomgen>=1.18',
-    'scipy'
+    'scipy',
 ]
 
 [build-system]
-- 
GitLab


From 4716c97aeae07e88525bd83bd6b3fb2b36926d0e Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Mar 2025 11:26:38 +0100
Subject: [PATCH 09/15] final adjustments to the docs

---
 docs/source/installation.md            | 2 +-
 docs/source/user_manual/gpu_kernels.md | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/source/installation.md b/docs/source/installation.md
index 5cb274c93..8fdb5684f 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -41,7 +41,7 @@ This requires a working installation of [Cupy](https://cupy.dev).
 Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html)
 for details about installing cupy.
 
-You can also use Cupy together with AMD ROCm for AMD graphics cards,
+You can also use Cupy together with AMD ROCm and HIP for AMD graphics cards,
 but the setup steps are a bit more complicated - you might have to build cupy from source.
 The Cupy documentation covers this in their [installation guide for Cupy on ROCm][cupy-rocm].
 
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 14a29c41c..d272cf9c6 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -39,7 +39,7 @@ which operates much in the same way that [NumPy][numpy] works on CPU arrays.
 Cupy and NumPy expose nearly the same APIs for array operations;
 the difference being that CuPy allocates all its arrays on the GPU
 and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
+Also, CuPy exposes a just-in-time-compiler for GPU kernels.
 In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
 and to allocate and manage the data these kernels can be executed on.
 
@@ -271,5 +271,4 @@ only a part of the triangle is being processed.
 
 [cupy]: https://cupy.dev "CuPy Homepage"
 [numpy]: https://numpy.org "NumPy Homepage"
-[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library"
 [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
-- 
GitLab


From 7db7f592e9691ced3052fa6301f45c89afa54165 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Mar 2025 11:50:17 +0100
Subject: [PATCH 10/15] do not use cupy in code that must always be executable

---
 src/pystencils/jit/gpu_cupy.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 69e965325..7065ce815 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -208,12 +208,6 @@ class CupyKernelWrapper(KernelWrapper):
 class CupyJit(JitBase):
 
     def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
-        self._runtime_headers: set[str]
-        if cp.cuda.runtime.is_hip:
-            self._runtime_headers = set()
-        else:
-            self._runtime_headers = {"<cstdint>"}
-
         if len(default_block_size) > 3:
             raise ValueError(
                 f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
@@ -234,12 +228,12 @@ class CupyJit(JitBase):
             raise JitError(
                 "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP"
             )
-        
+
         if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip:
             raise JitError(
                 "Cannot compile a CUDA kernel on a HIP-based Cupy installation."
             )
-        
+
         if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip:
             raise JitError(
                 "Cannot compile a HIP kernel on a CUDA-based Cupy installation."
@@ -261,7 +255,13 @@ class CupyJit(JitBase):
         return tuple(options)
 
     def _prelude(self, kfunc: GpuKernel) -> str:
-        headers = self._runtime_headers
+
+        headers: set[str]
+        if cp.cuda.runtime.is_hip:
+            headers = set()
+        else:
+            headers = {"<cstdint>"}
+
         headers |= kfunc.required_headers
 
         if '"pystencils_runtime/half.h"' in headers:
-- 
GitLab
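
The idea behind the patch above generalizes: defer every query of the `cupy` runtime to the moment a
kernel is actually compiled, so that merely importing or constructing the JIT object never requires a
working GPU stack. A rough, hypothetical sketch of that pattern (not the actual pystencils code):

```python
def runtime_headers() -> set[str]:
    """Select prelude headers lazily, depending on whether Cupy targets HIP or CUDA."""
    import cupy as cp  # imported only once a kernel is actually being compiled

    # On HIP, the fixed-width integer types are already available;
    # on CUDA, <cstdint> is included explicitly.
    return set() if cp.cuda.runtime.is_hip else {"<cstdint>"}
```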


From 9b1219516168eaa987d2661a875922569284f8b6 Mon Sep 17 00:00:00 2001
From: Richard Angersbach <richard.angersbach@fau.de>
Date: Mon, 17 Mar 2025 14:23:59 +0100
Subject: [PATCH 11/15] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Richard Angersbach <richard.angersbach@fau.de>
---
 docs/source/backend/gpu_codegen.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 0415c12c6..b487edbbb 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -11,7 +11,7 @@ The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP co
    It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
    for the runtime system.
 
-## The CUDA Platform and Thread Mappings
+## The GPU Platform and Thread Mappings
 
 ```{eval-rst}
 .. module:: pystencils.backend.platforms.generic_gpu
-- 
GitLab


From 6a06d72faeb040058b67bfec5ffee2cacff2bcad Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 18 Mar 2025 15:42:57 +0100
Subject: [PATCH 12/15] Merge branch 'v2.0-dev' into fhennig/hip-target

---
 docs/source/backend/gpu_codegen.md            |   7 +
 docs/source/user_manual/gpu_kernels.md        |  45 +-
 .../backend/platforms/generic_gpu.py          |  29 +-
 src/pystencils/backend/platforms/sycl.py      |  17 +-
 src/pystencils/codegen/config.py              |  44 +-
 src/pystencils/codegen/driver.py              |  24 +-
 src/pystencils/codegen/gpu_indexing.py        | 410 +++++++++++++++---
 src/pystencils/jit/gpu_cupy.py                |  13 +-
 src/pystencils/utils.py                       |   5 +
 tests/kernelcreation/test_gpu.py              | 133 +++++-
 10 files changed, 573 insertions(+), 154 deletions(-)

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index b487edbbb..3fe00840e 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -68,6 +68,13 @@ These depend on the type of the launch configuration:
 while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
 parameters,
 the `ManualLaunchConfiguration` requires the user to manually specify both grid and block size.
+The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size
+or a computed block size. Block size computation is requested by the user via the `trim_block_size` or
+`fit_block_size` member functions. Both functions receive an initial block size as an argument and adapt it.
+The `trim_block_size` function trims the initial block size against the iteration space, i.e. it takes
+the minimum of both sizes per dimension. The `fit_block_size` function runs a block-fitting algorithm that
+incrementally enlarges the trimmed block size until it is large enough and aligned with the warp size.
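+
+A rough usage sketch (assuming a compiled GPU kernel wrapper `kfunc` whose launch configuration is a
+`DynamicBlockSizeLaunchConfiguration`):
+
+```python
+# Request block-size computation on a dynamic launch configuration
+kfunc.launch_config.trim_block_size((16, 16, 1))  # clamp the initial size to the iteration space
+# or, alternatively:
+kfunc.launch_config.fit_block_size((16, 16, 1))   # enlarge the trimmed size until it is warp-aligned
+```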
 
 The `evaluate` method can only be used from within a Python runtime environment.
 When exporting pystencils CUDA kernels for external use in C++ projects,
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index d272cf9c6..7a3d54f6f 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -121,24 +121,49 @@ For most kernels with an at most three-dimensional iteration space,
 this behavior is sufficient and desired.
 It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
 
-If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places.
-The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
-code generator option;
-if none is specified, a default depending on the iteration space's dimensionality will be used.
-
-The block size can furthermore be modified at the compiled kernel's wrapper object via the
-`launch_config.block_size` attribute:
+The GPU thread block size of a compiled kernel's wrapper object can only be set directly
+when a [manual launch configuration](#manual_launch_grids) is used.
+Linear indexing schemes without a manual launch configuration either employ a default block size
+or compute one via user-exposed member functions that adapt an initial block size passed as an argument.
+The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`.
+The :meth:`trim_block_size` function trims the user-defined initial block size against the iteration space.
+The :meth:`fit_block_size` function employs a fitting algorithm that finds a suitable block configuration
+divisible by the hardware's warp size.
 
 ```{code-cell} ipython3
+:tags: [raises-exception]
 kfunc = kernel.compile()
-kfunc.launch_config.block_size = (256, 2, 1)
 
-# Run the kernel
+# three different configuration cases for block size:
+
+# a) Nothing
+# -> use default block size, perform no fitting, no trimming
+
+# b) Activate fitting with initial block size
+kfunc.launch_config.fit_block_size((8, 8, 4))
+
+# c) Activate trimming with initial block size
+kfunc.launch_config.trim_block_size((8, 8, 4))
+
+# finally run the kernel...
 kfunc(f=f_arr, g=g_arr)
 ```
 
+Block sizes that are multiples of the hardware's warp size allow for better utilization of the GPU's resources.
+Such block size configurations are not enforced, but notifying the code generator via the
+`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size
+opens up further optimization potential, e.g. for warp-level reductions.
+When setting this option to `True`, the user has to make sure that this alignment actually holds.
+For [manual launch configurations](#manual_launch_grids), this is achieved by providing suitable
+block sizes via `launch_config.block_size`. For the other launch configurations, the criterion is guaranteed
+when pystencils' default block sizes are used. Using :meth:`fit_block_size` also guarantees it, while producing
+block sizes better tailored to the kernel's iteration space. For :meth:`trim_block_size`,
+the trimmed block dimension that is closest to the next multiple of the warp size is rounded up to that multiple.
+In all cases, the final block size is checked against the hardware limits, and an error
+is raised if these limits are exceeded.
+
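+A minimal configuration sketch (for illustration only; the option values here are assumptions, not recommendations):
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU)
+# Promise the code generator that block sizes will be multiples of the warp size
+cfg.gpu.assume_warp_aligned_block_size = True
+# Optionally override the warp/wavefront size assumed for the target
+cfg.gpu.warp_size = 32
+```
+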
 In any case, pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
-and the given thread block size.
+and the final thread block size.
 
 :::{attention}
 
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index fac37ffa5..11425d923 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -174,12 +174,10 @@ class GenericGpu(Platform):
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._thread_mapping = (
             thread_mapping if thread_mapping is not None else Linear3DMapping()
         )
@@ -278,17 +276,12 @@ class GenericGpu(Platform):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr_expr, dim.stop))
-
-        if conds:
-            condition: PsExpression = conds[0]
-            for cond in conds[1:]:
-                condition = PsAnd(condition, cond)
-            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-        else:
-            body.statements = indexing_decls + body.statements
-            ast = body
+            conds.append(PsLt(ctr_expr, dim.stop))
+
+        condition: PsExpression = conds[0]
+        for cond in conds[1:]:
+            condition = PsAnd(condition, cond)
+        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
 
         return ast
 
@@ -320,12 +313,8 @@ class GenericGpu(Platform):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr_expr.clone(), stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr_expr.clone(), stop)
+        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
 
         return ast
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index f3c4bb3d5..d16c4f51b 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -35,12 +35,10 @@ class SyclPlatform(Platform):
     def __init__(
         self,
         ctx: KernelCreationContext,
-        omit_range_check: bool = False,
         automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
-        self._omit_range_check = omit_range_check
         self._automatic_block_size = automatic_block_size
 
     @property
@@ -136,8 +134,7 @@ class SyclPlatform(Platform):
             indexing_decls.append(
                 PsDeclaration(ctr, dim.start + work_item_idx * dim.step)
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
+            conds.append(PsLt(ctr, dim.stop))
 
         if conds:
             condition: PsExpression = conds[0]
@@ -182,15 +179,9 @@ class SyclPlatform(Platform):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr, stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-        else:
-            body.statements = [sparse_idx_decl] + body.statements
-            ast = body
-
-        return ast
+        stop = PsExpression.make(ispace.index_list.shape[0])
+        condition = PsLt(sparse_ctr, stop)
+        return PsBlock([sparse_idx_decl, PsConditional(condition, body)])
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index a765dea2e..91aff43f4 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -372,30 +372,38 @@ class GpuOptions(ConfigBase):
     indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
     """Thread indexing scheme for dense GPU kernels."""
 
-    omit_range_check: BasicOption[bool] = BasicOption(False)
-    """If set to `True`, omit the iteration counter range check.
+    manual_launch_grid: BasicOption[bool] = BasicOption(False)
+    """Always require a manually specified launch grid when running this kernel.
     
-    By default, the code generator introduces a check if the iteration counters computed from GPU block and thread
-    indices are within the prescribed loop range.
-    This check can be discarded through this option, at your own peril.
+    If set to `True`, the code generator will not attempt to infer the size of
+    the launch grid from the kernel.
+    The launch grid will then have to be specified manually at runtime.
     """
 
-    block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
-    """Desired block size for the execution of GPU kernels.
+    warp_size: BasicOption[int] = BasicOption()
+    """Specifies the size of a warp (CUDA) or wavefront (HIP).
     
-    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
-    is chosen as an indexing scheme.
-    The block size may be overridden at runtime.
+    If this option is not set, the default value for the given target will be used automatically.
     """
 
-    manual_launch_grid: BasicOption[bool] = BasicOption(False)
-    """Always require a manually specified launch grid when running this kernel.
+    assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False)
+    """Specifies whether block sizes are divisible by the hardware's warp size.
     
-    If set to `True`, the code generator will not attempt to infer the size of
-    the launch grid from the kernel.
-    The launch grid will then have to be specified manually at runtime.
+    If set to `True`, the code generator can employ optimizations that require this assumption, 
+    e.g. warp-level reductions.
+    The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion.
     """
 
+    @staticmethod
+    def default_warp_size(target: Target) -> int:
+        match target:
+            case Target.CUDA:
+                return 32
+            case _:
+                raise NotImplementedError(
+                    f"No default warp/wavefront size known for target {target}"
+                )
+
     @indexing_scheme.validate
     def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
         if isinstance(val, GpuIndexingScheme):
@@ -605,7 +613,7 @@ class CreateKernelConfig(ConfigBase):
             elif target == Target.CUDA or target == Target.HIP:
                 try:
                     from ..jit.gpu_cupy import CupyJit
-                    
+
                     return CupyJit()
 
                 except ImportError:
@@ -738,9 +746,7 @@ class CreateKernelConfig(ConfigBase):
                 UserWarning,
             )
 
-            self.gpu = GpuOptions(
-                block_size=gpu_indexing_params.get("block_size", None)
-            )
+            self.gpu = GpuOptions()
 
 
 def _deprecated_option(name, instead):  # pragma: no cover
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 9e3c8b163..9f59e3510 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -11,6 +11,7 @@ from .config import (
     GhostLayerSpec,
     IterationSliceSpec,
     GpuIndexingScheme,
+    GpuOptions,
 )
 from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
@@ -401,13 +402,22 @@ class DefaultKernelCreationDriver:
         if not self._target.is_gpu():
             return None
 
-        from .gpu_indexing import dim3
-
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
-        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
+        assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
+        warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
-        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
+        if warp_size is None:
+            warp_size = GpuOptions.default_warp_size(self._target)
+
+        return GpuIndexing(
+            self._ctx,
+            self._target,
+            idx_scheme,
+            warp_size,
+            manual_launch_grid,
+            assume_warp_aligned_block_size,
+        )
 
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
@@ -437,9 +447,6 @@ class DefaultKernelCreationDriver:
                 )
 
         elif self._target.is_gpu():
-            gpu_opts = self._cfg.gpu
-            omit_range_check: bool = gpu_opts.get_option("omit_range_check")
-
             thread_mapping = (
                 self._gpu_indexing.get_thread_mapping()
                 if self._gpu_indexing is not None
@@ -457,7 +464,6 @@ class DefaultKernelCreationDriver:
 
             return GpuPlatform(
                 self._ctx,
-                omit_range_check=omit_range_check,
                 thread_mapping=thread_mapping,
             )
 
@@ -465,11 +471,9 @@ class DefaultKernelCreationDriver:
             from ..backend.platforms import SyclPlatform
 
             auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
-            omit_range_check = self._cfg.gpu.get_option("omit_range_check")
 
             return SyclPlatform(
                 self._ctx,
-                omit_range_check=omit_range_check,
                 automatic_block_size=auto_block_size,
             )
 
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 890afb0fb..0524eb0b1 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -1,13 +1,16 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
 from typing import cast, Any, Callable
 from itertools import chain
+from warnings import warn
 
 from .functions import Lambda
 from .parameters import Parameter
 from .errors import CodegenError
-from .config import GpuIndexingScheme, _AUTO_TYPE
+from .config import GpuIndexingScheme
+from .target import Target
 
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -16,13 +19,36 @@ from ..backend.kernelcreation import (
 )
 from ..backend.platforms.generic_gpu import ThreadMapping
 
-from ..backend.ast.expressions import PsExpression
+from ..backend.ast.expressions import PsExpression, PsIntDiv
+from math import prod
 
+from ..utils import ceil_to_multiple
 
 dim3 = tuple[int, int, int]
 _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 
+@dataclass
+class HardwareProperties:
+    warp_size: int
+    max_threads_per_block: int
+    max_block_sizes: dim3
+
+    def block_size_exceeds_hw_limits(
+            self,
+            block_size: tuple[int, ...]
+    ) -> bool:
+        """Checks if provided block size conforms limits given by the hardware."""
+
+        return (
+            any(
+                size > max_size
+                for size, max_size in zip(block_size, self.max_block_sizes)
+            )
+            or prod(block_size) > self.max_threads_per_block
+        )
+
+
 class GpuLaunchConfiguration(ABC):
     """Base class for launch configurations for CUDA and HIP kernels.
 
@@ -33,6 +59,18 @@ class GpuLaunchConfiguration(ABC):
             parameters to the associated kernel
     """
 
+    @property
+    @abstractmethod
+    def block_size(self) -> dim3 | None:
+        """Returns desired block size if available."""
+        pass
+
+    @block_size.setter
+    @abstractmethod
+    def block_size(self, val: dim3):
+        """Sets desired block size if possible."""
+        pass
+
     @property
     @abstractmethod
     def parameters(self) -> frozenset[Parameter]:
@@ -52,6 +90,25 @@ class GpuLaunchConfiguration(ABC):
         this launch configuration, such that when the configuration changes, the JIT parameter
         cache is invalidated."""
 
+    @staticmethod
+    def get_default_block_size(rank: int) -> dim3:
+        """Returns the default block size configuration used by the generator."""
+
+        match rank:
+            case 1:
+                return (256, 1, 1)
+            case 2:
+                return (16, 16, 1)
+            case 3:
+                return (8, 8, 4)
+            case _:
+                assert False, "unreachable code"
+
+    @staticmethod
+    def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
+        return f"Unable to determine GPU block size for this kernel. \
+        Final block size was too large: {block_size}."
+
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
     """Launch configuration that is dynamically computed from kernel parameters.
@@ -63,20 +120,37 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
         self,
         block_size: _Dim3Lambda,
         grid_size: _Dim3Lambda,
+        hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool,
     ) -> None:
         self._block_size = block_size
         self._grid_size = grid_size
+        self._hw_props = hw_props
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(lb.parameters for lb in chain(block_size, grid_size))
         )
 
+    @property
+    def block_size(self) -> dim3 | None:
+        """Block size is only available when `evaluate` is called."""
+        return None
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
+
     @property
     def parameters(self) -> frozenset[Parameter]:
         return self._params
 
     def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+
+        if self._hw_props.block_size_exceeds_hw_limits(block_size):
+            raise CodegenError(f"Block size {block_size} exceeds hardware limits.")
+
         grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
         return cast(dim3, block_size), cast(dim3, grid_size)
 
@@ -91,8 +165,12 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
     """
 
     def __init__(
-        self,
+        self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False
     ) -> None:
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        self._hw_props = hw_props
+
         self._block_size: dim3 | None = None
         self._grid_size: dim3 | None = None
 
@@ -123,6 +201,18 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
         if self._grid_size is None:
             raise AttributeError("No GPU grid size was set by the user.")
 
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(self._block_size) % self._hw_props.warp_size != 0
+        ):
+            raise CodegenError(
+                "Specified block sizes must align with warp size with "
+                "`assume_warp_aligned_block_size` enabled."
+            )
+
+        if self._hw_props.block_size_exceeds_hw_limits(self._block_size):
+            raise CodegenError(self._excessive_block_size_error_msg(self._block_size))
+
         return self._block_size, self._grid_size
 
     def jit_cache_key(self) -> Any:
@@ -130,24 +220,60 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
 
 class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
-    """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
+    """GPU launch configuration that dynamically computes the grid size from either the default block size
+    or a computed block size. Computing block sizes can be triggerred via the :meth:`trim_block_size` or
+    :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they
+    receive as an argument. The adaptation of the initial block sizes is described in the following:
 
-    The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
-    in the kernel's iteration space as follows.
     For each dimension :math:`c \\in \\{ x, y, z \\}`,
 
-    - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``;
-    - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    - if :meth:`fit_block_size` was chosen:
+
+        the initial block size is adapted such that it aligns with multiples of the hardware's warp size.
+        This is done using a fitting algorithm that first trims the initial block size against the iteration space
+        and then increases it incrementally until it is large enough and a multiple of the warp size, i.e.
+
+        ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)``
+
+        The fitted block size also guarantees that `GpuOptions.assume_warp_aligned_block_size` can safely be enabled.
+
+    - elif :meth:`trim_block_size` was chosen:
+
+        the initial block size is trimmed against the number of work items in the kernel's iteration space, i.e.
+
+        - if ``init_block_size.c > num_work_items.c``, ``block_size.c = num_work_items.c``
+        - otherwise, ``block_size.c = init_block_size.c``
+
+        When `GpuOptions.assume_warp_aligned_block_size` is set, warp alignment is ensured by
+        rounding up the block size dimension that is closest to the next multiple of the warp size.
+
+    - otherwise: the default block size is taken, i.e.
+
+        ``block_size.c = get_default_block_size(rank=3).c``
+
+    The actual launch grid size is then computed as follows.
+
+    ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
     """
 
     def __init__(
         self,
         num_work_items: _Dim3Lambda,
-        default_block_size: dim3 | None = None,
+        hw_props: HardwareProperties,
+        assume_warp_aligned_block_size: bool,
     ) -> None:
         self._num_work_items = num_work_items
 
-        self._block_size: dim3 | None = default_block_size
+        self._hw_props = hw_props
+
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
+        self._default_block_size = default_bs
+        self._init_block_size: dim3 = default_bs
+        self._compute_block_size: (
+            Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None
+        ) = None
 
         self._params: frozenset[Parameter] = frozenset().union(
             *(wit.parameters for wit in num_work_items)
@@ -159,44 +285,188 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
         dimension from kernel parameters."""
         return self._num_work_items
 
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params
+
     @property
     def block_size(self) -> dim3 | None:
-        """The desired GPU block size."""
-        return self._block_size
+        """Block size is only available when `evaluate` is called."""
+        return None
 
     @block_size.setter
     def block_size(self, val: dim3):
-        self._block_size = val
+        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
+
+    @staticmethod
+    def _round_block_sizes_to_warp_size(
+        to_round: tuple[int, ...], warp_size: int
+    ) -> tuple[int, ...]:
+        # check if already aligns with warp size
+        if prod(to_round) % warp_size == 0:
+            return tuple(to_round)
+
+        # find the dimension whose size is closest to its next multiple of the warp size and round it up
+        index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size)))
+        if index_to_round + 1 < len(to_round):
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+                *to_round[index_to_round + 1:],
+            )
+        else:
+            return (
+                *to_round[:index_to_round],
+                ceil_to_multiple(to_round[index_to_round], warp_size),
+            )
 
-    @property
-    def parameters(self) -> frozenset[Parameter]:
-        """Parameters of this launch configuration"""
-        return self._params
+    def trim_block_size(self, block_size: dim3):
+        def call_trimming_factory(
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
+        ):
+            return self._trim_block_size_to_it_space(it, bs, hw)
 
-    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
-        if self._block_size is None:
-            raise AttributeError("No GPU block size was specified by the user!")
+        self._init_block_size = block_size
+        self._compute_block_size = call_trimming_factory
+
+    def _trim_block_size_to_it_space(
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns specified block sizes trimmed with iteration space.
+        Raises CodegenError if trimmed block size does not conform hardware limits.
+        """
+
+        ret = tuple([min(b, i) for b, i in zip(block_size, it_space)])
+        if hw_props.block_size_exceeds_hw_limits(ret):
+            raise CodegenError(self._excessive_block_size_error_msg(ret))
+
+        if (
+            self._assume_warp_aligned_block_size
+            and prod(ret) % self._hw_props.warp_size != 0
+        ):
+            ret = self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
+
+        return ret
+
+    def fit_block_size(self, block_size: dim3):
+        def call_fitting_factory(
+            it: dim3,
+            bs: dim3,
+            hw: HardwareProperties,
+        ):
+            return self._fit_block_size_to_it_space(it, bs, hw)
+
+        self._init_block_size = block_size
+        self._compute_block_size = call_fitting_factory
+
+    def _fit_block_size_to_it_space(
+        self,
+        it_space: dim3,
+        block_size: dim3,
+        hw_props: HardwareProperties,
+    ) -> tuple[int, ...]:
+        """Returns an optimized block size configuration with block sizes being aligned with the warp size.
+        Raises CodegenError if optimal block size could not be found or does not conform hardware limits.
+        """
 
+        def trim(to_trim: list[int]) -> list[int]:
+            return [min(b, i) for b, i in zip(to_trim, it_space)]
+
+        def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]:
+            if hw_props.block_size_exceeds_hw_limits(ret):
+                raise CodegenError(self._excessive_block_size_error_msg(ret))
+            return ret
+
+        trimmed = trim(list(block_size))
+        if (
+            prod(trimmed) >= hw_props.warp_size
+            and prod(trimmed) % hw_props.warp_size == 0
+        ):
+            # case 1: greater than min block size -> use trimmed result
+            return check_sizes_and_return(tuple(trimmed))
+
+        prev_trim_size = 0
+        resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space))
+        while prod(trimmed) != prev_trim_size:
+            prev_trim_size = prod(trimmed)
+
+            # case 2: trimmed block is equivalent to the whole iteration space
+            if all(b == i for b, i in zip(trimmed, it_space)):
+                return check_sizes_and_return(
+                    self._round_block_sizes_to_warp_size(
+                        tuple(trimmed), hw_props.warp_size
+                    )
+                )
+            else:
+                # double block size in each dimension until block is large enough (or case 2 triggers)
+                for d in resize_order:
+                    trimmed[d] *= 2
+
+                    # round fastest moving dim down to a multiple of the warp size
+                    if (
+                        d == 0
+                        and trimmed[d] > hw_props.warp_size
+                        and trimmed[d] % hw_props.warp_size != 0
+                    ):
+                        # subtract remainder
+                        trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size)
+
+                    # clamp block size to the hardware limit for this dimension
+                    trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d])
+
+                    # trim again
+                    trimmed = trim(trimmed)
+
+                    # case 3: trimmed block is large enough
+                    if prod(trimmed) >= hw_props.warp_size:
+                        return check_sizes_and_return(
+                            self._round_block_sizes_to_warp_size(
+                                tuple(trimmed), hw_props.warp_size
+                            )
+                        )
+
+        raise CodegenError("Unable to determine GPU block size for this kernel.")
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
         from ..utils import div_ceil
 
         num_work_items = cast(
             dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
         )
-        reduced_block_size = cast(
-            dim3,
-            tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
-        )
+
+        block_size: dim3
+        if self._compute_block_size:
+            try:
+                computed_bs = self._compute_block_size(
+                    num_work_items, self._init_block_size, self._hw_props
+                )
+
+                block_size = cast(dim3, computed_bs)
+            except CodegenError as e:
+                block_size = self._default_block_size
+                warn(
+                    f"CodegenError occurred: {getattr(e, 'message', repr(e))}. "
+                    "Block size fitting failed to determine an optimal configuration. "
+                    f"Falling back to {self._default_block_size}."
+                )
+        else:
+            block_size = self._default_block_size
+
         grid_size = cast(
             dim3,
-            tuple(
-                div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
-            ),
+            tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)),
         )
 
-        return reduced_block_size, grid_size
+        return block_size, grid_size
 
     def jit_cache_key(self) -> Any:
-        return self._block_size
+        return ()
 
 
 class GpuIndexing:
@@ -218,14 +488,24 @@ class GpuIndexing:
     def __init__(
         self,
         ctx: KernelCreationContext,
+        target: Target,
         scheme: GpuIndexingScheme,
-        default_block_size: dim3 | _AUTO_TYPE | None = None,
+        warp_size: int,
         manual_launch_grid: bool = False,
+        assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
+        self._target = target
         self._scheme = scheme
-        self._default_block_size = default_block_size
+        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+        self._hw_props = HardwareProperties(
+            warp_size,
+            self.get_max_threads_per_block(target),
+            self.get_max_block_sizes(target),
+        )
 
         from ..backend.kernelcreation import AstFactory
         from .driver import KernelFactory
@@ -233,6 +513,26 @@ class GpuIndexing:
         self._ast_factory = AstFactory(self._ctx)
         self._kernel_factory = KernelFactory(self._ctx)
 
+    @staticmethod
+    def get_max_block_sizes(target: Target):
+        match target:
+            case Target.CUDA:
+                return (1024, 1024, 64)
+            case _:
+                raise CodegenError(
+                    f"Cannot determine max GPU block sizes for target {target}"
+                )
+
+    @staticmethod
+    def get_max_threads_per_block(target: Target):
+        match target:
+            case Target.CUDA:
+                return 1024
+            case _:
+                raise CodegenError(
+                    f"Cannot determine max GPU threads per block for target {target}"
+                )
+
     def get_thread_mapping(self) -> ThreadMapping:
         """Retrieve a thread mapping object for use by the backend"""
 
@@ -247,7 +547,13 @@ class GpuIndexing:
     def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
         """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
         if self._manual_launch_grid:
-            return ManualLaunchConfiguration
+
+            def factory():
+                return ManualLaunchConfiguration(
+                    self._hw_props, self._assume_warp_aligned_block_size
+                )
+
+            return factory
 
         match self._scheme:
             case GpuIndexingScheme.Linear3D:
@@ -268,10 +574,9 @@ class GpuIndexing:
             )
 
         work_items_expr += tuple(
-            self._ast_factory.parse_index(1)
-            for _ in range(3 - rank)
+            self._ast_factory.parse_index(1) for _ in range(3 - rank)
         )
-        
+
         num_work_items = cast(
             _Dim3Lambda,
             tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr),
@@ -280,28 +585,12 @@ class GpuIndexing:
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
                 num_work_items,
-                self._get_default_block_size(rank),
+                self._hw_props,
+                self._assume_warp_aligned_block_size,
             )
 
         return factory
 
-    def _get_default_block_size(self, rank: int) -> dim3:
-        if self._default_block_size is None:
-            raise CodegenError("The default block size option was not set")
-
-        if isinstance(self._default_block_size, _AUTO_TYPE):
-            match rank:
-                case 1:
-                    return (256, 1, 1)
-                case 2:
-                    return (128, 2, 1)
-                case 3:
-                    return (128, 2, 2)
-                case _:
-                    assert False, "unreachable code"
-        else:
-            return self._default_block_size
-
     def _get_blockwise4d_config_factory(
         self,
     ) -> Callable[[], AutomaticLaunchConfiguration]:
@@ -311,8 +600,19 @@ class GpuIndexing:
         if rank > 4:
             raise ValueError(f"Iteration space rank is too large: {rank}")
 
+        # the block size determination functions cannot be used here since the iteration space is unknown
+        # -> round the block size of the fastest moving dimension up to a multiple of the warp size
+        rounded_block_size: PsExpression
+        if self._assume_warp_aligned_block_size:
+            warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
+            rounded_block_size = self._ast_factory.parse_index(
+                PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
+                         warp_size.clone()) * warp_size.clone())
+        else:
+            rounded_block_size = work_items[0]
+
         block_size = (
-            self._kernel_factory.create_lambda(work_items[0]),
+            self._kernel_factory.create_lambda(rounded_block_size),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
             self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)),
         )
@@ -328,6 +628,8 @@ class GpuIndexing:
             return AutomaticLaunchConfiguration(
                 block_size,
                 cast(_Dim3Lambda, grid_size),
+                self._hw_props,
+                self._assume_warp_aligned_block_size,
             )
 
         return factory
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 7065ce815..221780961 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast, Callable
+from typing import Any, Callable
 from dataclasses import dataclass
 
 try:
@@ -207,17 +207,6 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
-        if len(default_block_size) > 3:
-            raise ValueError(
-                f"Invalid block size: {default_block_size}. Must be at most three-dimensional."
-            )
-
-        self._default_block_size: tuple[int, int, int] = cast(
-            tuple[int, int, int],
-            tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
-        )
-
     def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index 0049d0a2c..5ef13f31f 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -316,3 +316,8 @@ def div_ceil(divident, divisor):
     The result is unspecified if either argument is negative."""
 
     return c_intdiv(divident + divisor - 1, divisor)
+
+
+def ceil_to_multiple(divident, divisor):
+    """Rounds 'divident' to the next multiple of 'divisor'."""
+    return div_ceil(divident, divisor) * divisor
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 8d943e8fd..944bd1241 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -12,6 +12,7 @@ from pystencils import (
     create_kernel,
     Target,
 )
+from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties
 
 from pystencils.slicing import (
     add_ghost_layers,
@@ -19,6 +20,7 @@ from pystencils.slicing import (
     remove_ghost_layers,
     normalize_slice,
 )
+from math import prod
 
 try:
     import cupy as cp
@@ -29,10 +31,12 @@ except ImportError:
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_3d(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+    indexing_scheme: str,
+    manual_grid: bool,
+    assume_warp_aligned_block_size: bool,
 ):
     src, dst = fields("src, dst: [3D]")
     asm = Assignment(
@@ -47,8 +51,8 @@ def test_indexing_options_3d(
 
     cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -59,14 +63,100 @@ def test_indexing_options_3d(
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 8)
-                kernel.launch_config.grid_size = (4, 4, 2)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 10, 8)
+                    kernel.launch_config.grid_size = (5, 4, 2)
+                else:
+                    kernel.launch_config.block_size = (10, 10, 8)
+                    kernel.launch_config.grid_size = (4, 4, 2)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (32, 16, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (32, 16, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 8)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (32, 8, 2)
+        else:
+            kernel.launch_config.block_size = (10, 10, 10)
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+@pytest.mark.parametrize("iteration_space",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
+@pytest.mark.parametrize("initial_block_size",
+                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
+@pytest.mark.parametrize("use_block_fitting", [True, False])
+def test_block_size_adaptations(
+    iteration_space: tuple[int, int, int],
+    initial_block_size: tuple[int, int, int],
+    assume_warp_aligned_block_size: bool,
+    use_block_fitting: bool,
+):
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    target = Target.CUDA
+    cfg = CreateKernelConfig(target=target)
+    cfg.gpu.indexing_scheme = "linear3d"
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+    warp_size = cfg.gpu.default_warp_size(target)
+    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
+    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    if use_block_fitting:
+        # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
+        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
+            iteration_space,
+            initial_block_size,
+            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
+        )
+
+        # checks if criterion for warp size alignment is fulfilled
+        def check_suitability(b):
+            return prod(b) >= warp_size and prod(b) % warp_size == 0
+
+        # block size fitting should not modify an already ideal configuration
+        # -> check if ideal configurations are modified
+        if (
+                check_suitability(initial_block_size)
+                and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
+        ):
+            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
+                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
+
+        assert check_suitability(internal_block_size), \
+            "Determined block size shall be divisible by warp size."
+
+        # set block size via fitting algorithm
+        kernel.launch_config.fit_block_size(initial_block_size)
+    else:
+        # set block size via trimming algorithm
+        kernel.launch_config.trim_block_size(initial_block_size)
+
+    src_arr = cp.ones(iteration_space)
+    dst_arr = cp.zeros_like(src_arr)
 
     kernel(src=src_arr, dst=dst_arr)
 
@@ -77,10 +167,10 @@ def test_indexing_options_3d(
 
 
 @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
-@pytest.mark.parametrize("omit_range_check", [False, True])
 @pytest.mark.parametrize("manual_grid", [False, True])
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True])
 def test_indexing_options_2d(
-    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+    indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
     asm = Assignment(
@@ -93,8 +183,8 @@ def test_indexing_options_2d(
 
     cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
-    cfg.gpu.omit_range_check = omit_range_check
     cfg.gpu.manual_launch_grid = manual_grid
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
@@ -105,14 +195,25 @@ def test_indexing_options_2d(
     if manual_grid:
         match indexing_scheme:
             case "linear3d":
-                kernel.launch_config.block_size = (10, 8, 1)
-                kernel.launch_config.grid_size = (4, 2, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (8, 8, 1)
+                    kernel.launch_config.grid_size = (5, 2, 1)
+                else:
+                    kernel.launch_config.block_size = (10, 8, 1)
+                    kernel.launch_config.grid_size = (4, 2, 1)
             case "blockwise4d":
-                kernel.launch_config.block_size = (40, 1, 1)
-                kernel.launch_config.grid_size = (16, 1, 1)
+                if assume_warp_aligned_block_size:
+                    kernel.launch_config.block_size = (64, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
+                else:
+                    kernel.launch_config.block_size = (40, 1, 1)
+                    kernel.launch_config.grid_size = (16, 1, 1)
 
     elif indexing_scheme == "linear3d":
-        kernel.launch_config.block_size = (10, 8, 1)
+        if assume_warp_aligned_block_size:
+            kernel.launch_config.block_size = (8, 8, 1)
+        else:
+            kernel.launch_config.block_size = (10, 8, 1)
 
     kernel(src=src_arr, dst=dst_arr)
 
-- 
GitLab
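
The block-size fitting strategy added in `_fit_block_size_to_it_space` above can be
illustrated with a short standalone sketch. This is illustrative only and not the
pystencils implementation: the warp size of 32, the CUDA-like block limits, and the
simplified rounding of only the fastest (x) dimension are assumptions made for the
example.

    from math import prod

    # Assumed CUDA-like hardware limits, for illustration only.
    WARP_SIZE = 32
    MAX_THREADS_PER_BLOCK = 1024
    MAX_BLOCK_SIZES = (1024, 1024, 64)


    def div_ceil(dividend: int, divisor: int) -> int:
        """Ceiling division for positive integers."""
        return (dividend + divisor - 1) // divisor


    def ceil_to_multiple(dividend: int, divisor: int) -> int:
        """Round `dividend` up to the next multiple of `divisor`."""
        return div_ceil(dividend, divisor) * divisor


    def fit_block_size(it_space, block_size):
        """Trim `block_size` to `it_space`, then grow it until it spans at least one warp."""

        def trim(bs):
            return [min(b, i) for b, i in zip(bs, it_space)]

        def finalize(bs):
            # round the fastest dimension up to a warp multiple and sanity-check the size
            bs = (ceil_to_multiple(bs[0], WARP_SIZE), bs[1], bs[2])
            assert prod(bs) <= MAX_THREADS_PER_BLOCK
            return bs

        trimmed = trim(list(block_size))
        if prod(trimmed) >= WARP_SIZE and prod(trimmed) % WARP_SIZE == 0:
            return tuple(trimmed)  # already large enough and warp-aligned

        prev = 0
        while prod(trimmed) != prev:
            prev = prod(trimmed)
            if all(b == i for b, i in zip(trimmed, it_space)):
                return finalize(trimmed)  # block covers the whole iteration space
            for d in (0, 2, 1):  # grow x first, then z, then y
                trimmed[d] = min(trimmed[d] * 2, MAX_BLOCK_SIZES[d])
                trimmed = trim(trimmed)
                if prod(trimmed) >= WARP_SIZE:
                    return finalize(trimmed)
        raise ValueError("unable to determine a block size")


    print(fit_block_size((3, 12, 56), (8, 2, 1)))  # -> (32, 4, 4) under these assumptions
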


From 9e7299033af893db0fd38a0d64e88493fd536476 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 18 Mar 2025 16:07:13 +0000
Subject: [PATCH 13/15] update tests

---
 conftest.py                            | 14 +++--
 src/pystencils/codegen/config.py       |  4 +-
 src/pystencils/codegen/driver.py       |  2 +
 src/pystencils/codegen/gpu_indexing.py | 35 +++++++----
 tests/kernelcreation/test_gpu.py       | 84 +++++++++++++++-----------
 5 files changed, 86 insertions(+), 53 deletions(-)

diff --git a/conftest.py b/conftest.py
index ff0467eff..7ea8f5ba0 100644
--- a/conftest.py
+++ b/conftest.py
@@ -43,10 +43,16 @@ def add_path_to_ignore(path):
     ]
 
 
-collect_ignore = [
-    os.path.join(SCRIPT_FOLDER, "doc", "conf.py"),
-    os.path.join(SCRIPT_FOLDER, "src", "pystencils", "opencl", "opencl.autoinit"),
-]
+def ignore_file(fp):
+    global collect_ignore
+    collect_ignore += [os.path.join(SCRIPT_FOLDER, fp)]
+
+
+collect_ignore = []
+
+ignore_file("noxfile.py")
+ignore_file("docs/source/conf.py")
+add_path_to_ignore("docs/build")
 add_path_to_ignore("tests/benchmark")
 add_path_to_ignore("_local_tmp")
 
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index 91aff43f4..8e7e54ff1 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -395,10 +395,12 @@ class GpuOptions(ConfigBase):
     """
 
     @staticmethod
-    def default_warp_size(target: Target):
+    def default_warp_size(target: Target) -> int | None:
         match target:
             case Target.CUDA:
                 return 32
+            case Target.HIP:
+                return None
             case _:
                 raise NotImplementedError(
                     f"No default warp/wavefront size known for target {target}"
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 9f59e3510..543d9db2f 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -410,6 +410,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
+        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+
         return GpuIndexing(
             self._ctx,
             self._target,
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 0524eb0b1..b5e70043f 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -34,10 +34,7 @@ class HardwareProperties:
     max_threads_per_block: int
     max_block_sizes: dim3
 
-    def block_size_exceeds_hw_limits(
-            self,
-            block_size: tuple[int, ...]
-    ) -> bool:
+    def block_size_exceeds_hw_limits(self, block_size: tuple[int, ...]) -> bool:
         """Checks if provided block size conforms limits given by the hardware."""
 
         return (
@@ -106,8 +103,10 @@ class GpuLaunchConfiguration(ABC):
 
     @staticmethod
     def _excessive_block_size_error_msg(block_size: tuple[int, ...]):
-        return f"Unable to determine GPU block size for this kernel. \
-        Final block size was too large: {block_size}."
+        return (
+            "Unable to determine GPU block size for this kernel. "
+            f"Final block size was too large: {block_size}."
+        )
 
 
 class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
@@ -139,7 +138,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an automatic launch configuration has no effect.")
+        AttributeError(
+            "Setting `block_size` on an automatic launch configuration has no effect."
+        )
 
     @property
     def parameters(self) -> frozenset[Parameter]:
@@ -297,7 +298,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     @block_size.setter
     def block_size(self, val: dim3):
-        AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.")
+        AttributeError(
+            "Setting `block_size` on an dynamic launch configuration has no effect."
+        )
 
     @staticmethod
     def _round_block_sizes_to_warp_size(
@@ -313,7 +316,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1:],
+                *to_round[index_to_round + 1 :],
             )
         else:
             return (
@@ -518,6 +521,8 @@ class GpuIndexing:
         match target:
             case Target.CUDA:
                 return (1024, 1024, 64)
+            case Target.HIP:
+                return (1024, 1024, 1024)
             case _:
                 raise CodegenError(
                     f"Cannot determine max GPU block sizes for target {target}"
@@ -526,7 +531,7 @@ class GpuIndexing:
     @staticmethod
     def get_max_threads_per_block(target: Target):
         match target:
-            case Target.CUDA:
+            case Target.CUDA | Target.HIP:
                 return 1024
             case _:
                 raise CodegenError(
@@ -606,8 +611,14 @@ class GpuIndexing:
         if self._assume_warp_aligned_block_size:
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
-                PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1),
-                         warp_size.clone()) * warp_size.clone())
+                PsIntDiv(
+                    work_items[0].clone()
+                    + warp_size.clone()
+                    - self._ast_factory.parse_index(1),
+                    warp_size.clone(),
+                )
+                * warp_size.clone()
+            )
         else:
             rounded_block_size = work_items[0]
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 944bd1241..bbe9aedd9 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -90,10 +90,32 @@ def test_indexing_options_3d(
 
     cp.testing.assert_allclose(dst_arr, expected)
 
-@pytest.mark.parametrize("iteration_space",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)])
-@pytest.mark.parametrize("initial_block_size",
-                         [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)])
+
+@pytest.mark.parametrize(
+    "iteration_space",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (17, 1, 1),
+        (1, 12, 56),
+        (65, 65, 65),
+        (1, 7, 9),
+    ],
+)
+@pytest.mark.parametrize(
+    "initial_block_size",
+    [
+        (8, 4, 4),
+        (1, 8, 8),
+        (1, 1, 16),
+        (2, 2, 64),
+        (8, 2, 1),
+        (3, 1, 32),
+        (32, 1, 1),
+        (1, 2, 3),
+    ],
+)
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 def test_block_size_adaptations(
@@ -102,7 +124,13 @@ def test_block_size_adaptations(
     assume_warp_aligned_block_size: bool,
     use_block_fitting: bool,
 ):
-    src, dst = fields("src, dst: [3D]")
+    field_shape = tuple(2 + x for x in iteration_space[::-1])
+    src_arr = cp.ones(field_shape)
+    dst_arr = cp.zeros_like(src_arr)
+
+    src = Field.create_from_numpy_array("src", src_arr)
+    dst = Field.create_from_numpy_array("dst", dst_arr)
+
     asm = Assignment(
         dst.center(),
         src[-1, 0, 0]
@@ -113,25 +141,20 @@ def test_block_size_adaptations(
         + src[0, 0, 1],
     )
 
-    target = Target.CUDA
+    target = Target.CurrentGPU
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-    warp_size = cfg.gpu.default_warp_size(target)
-    max_threads_per_block = GpuIndexing.get_max_threads_per_block(target)
-    max_block_sizes = GpuIndexing.get_max_block_sizes(target)
+    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
 
     if use_block_fitting:
         # test internal block fitting function later used in `kernel.launch_config.fit_block_size`
-        internal_block_size = kernel.launch_config._fit_block_size_to_it_space(
-            iteration_space,
-            initial_block_size,
-            HardwareProperties(warp_size, max_threads_per_block, max_block_sizes),
-        )
+        kernel.launch_config.fit_block_size(initial_block_size)
+        internal_block_size, _ = kernel.launch_config.evaluate()
 
         # checks if criterion for warp size alignment is fulfilled
         def check_suitability(b):
@@ -139,25 +162,20 @@ def test_block_size_adaptations(
 
         # block size fitting should not modify an already ideal configuration
         # -> check if ideal configurations are modified
-        if (
-                check_suitability(initial_block_size)
-                and all(x == y for x, y in zip(initial_block_size, iteration_space))  # trimming may alter results
-        ):
-            assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \
-                f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
-
-        assert check_suitability(internal_block_size), \
-            "Determined block size shall be divisible by warp size."
-
-        # set block size via fitting algorithm
-        kernel.launch_config.fit_block_size(initial_block_size)
+        if check_suitability(initial_block_size) and all(
+            x == y for x, y in zip(initial_block_size, iteration_space)
+        ):  # trimming may alter results
+            assert all(
+                x == y for x, y in zip(initial_block_size, internal_block_size)
+            ), f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}."
+
+        assert check_suitability(
+            internal_block_size
+        ), "Determined block size shall be divisible by warp size."
     else:
         # set block size via trimming algorithm
         kernel.launch_config.trim_block_size(initial_block_size)
 
-    src_arr = cp.ones(iteration_space)
-    dst_arr = cp.zeros_like(src_arr)
-
     kernel(src=src_arr, dst=dst_arr)
 
     expected = cp.zeros_like(src_arr)
@@ -173,13 +191,7 @@ def test_indexing_options_2d(
     indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool
 ):
     src, dst = fields("src, dst: [2D]")
-    asm = Assignment(
-        dst.center(),
-        src[-1, 0]
-        + src[1, 0]
-        + src[0, -1]
-        + src[0, 1]
-    )
+    asm = Assignment(dst.center(), src[-1, 0] + src[1, 0] + src[0, -1] + src[0, 1])
 
     cfg = CreateKernelConfig(target=Target.CurrentGPU)
     cfg.gpu.indexing_scheme = indexing_scheme
-- 
GitLab
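
The updated test above now drives block-size determination exclusively through the public
`fit_block_size` / `trim_block_size` entry points of the compiled kernel's launch
configuration. A condensed sketch of that usage pattern follows; it assumes cupy and a
CUDA- or HIP-capable device are available, and the domain shape, stencil, and initial
block size are arbitrary example values.

    import cupy as cp

    from pystencils import Assignment, CreateKernelConfig, Field, Target, create_kernel

    # Example arrays; the outermost layer acts as ghost layers.
    src_arr = cp.ones((10, 14, 18))
    dst_arr = cp.zeros_like(src_arr)
    src = Field.create_from_numpy_array("src", src_arr)
    dst = Field.create_from_numpy_array("dst", dst_arr)

    asm = Assignment(dst.center(), src[-1, 0, 0] + src[1, 0, 0])

    cfg = CreateKernelConfig(target=Target.CurrentGPU)
    cfg.gpu.indexing_scheme = "linear3d"
    cfg.gpu.assume_warp_aligned_block_size = True
    cfg.gpu.warp_size = 32  # set explicitly, since e.g. Target.HIP defines no default

    kernel = create_kernel(asm, cfg).compile()

    # Fit the initial block size to the iteration space, then inspect the
    # block and grid size that will actually be launched.
    kernel.launch_config.fit_block_size((8, 2, 1))
    block_size, grid_size = kernel.launch_config.evaluate()

    kernel(src=src_arr, dst=dst_arr)
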


From 7c371fb43a0eb731268fa3fe8fd7861f0b4b5f68 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 19 Mar 2025 13:03:39 +0000
Subject: [PATCH 14/15] make no assumptions about warp_size for Target.HIP

---
 src/pystencils/codegen/driver.py       |  4 +++-
 src/pystencils/codegen/gpu_indexing.py | 20 ++++++++++++++------
 tests/kernelcreation/test_gpu.py       |  5 +++--
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 543d9db2f..e9fc69b76 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import cast, Sequence, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
+from warnings import warn
 
 from .target import Target
 from .config import (
@@ -410,7 +411,8 @@ class DefaultKernelCreationDriver:
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
-        # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True
+        if warp_size is None and assume_warp_aligned_block_size:
+            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
 
         return GpuIndexing(
             self._ctx,
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index b5e70043f..09570e345 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda]
 
 @dataclass
 class HardwareProperties:
-    warp_size: int
+    warp_size: int | None
     max_threads_per_block: int
     max_block_sizes: dim3
 
@@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
             and prod(self._block_size) % self._hw_props.warp_size != 0
         ):
             raise CodegenError(
@@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return (
                 *to_round[:index_to_round],
                 ceil_to_multiple(to_round[index_to_round], warp_size),
-                *to_round[index_to_round + 1 :],
+                *to_round[index_to_round + 1:],
             )
         else:
             return (
@@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         if (
             self._assume_warp_aligned_block_size
-            and prod(ret) % self._hw_props.warp_size != 0
+            and hw_props.warp_size is not None
+            and prod(ret) % hw_props.warp_size != 0
         ):
             ret = self._round_block_sizes_to_warp_size(ret, hw_props.warp_size)
 
@@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
             return ret
 
         trimmed = trim(list(block_size))
+
+        if hw_props.warp_size is None:
+            return tuple(trimmed)
+
         if (
             prod(trimmed) >= hw_props.warp_size
             and prod(trimmed) % hw_props.warp_size == 0
@@ -493,14 +499,13 @@ class GpuIndexing:
         ctx: KernelCreationContext,
         target: Target,
         scheme: GpuIndexingScheme,
-        warp_size: int,
+        warp_size: int | None,
         manual_launch_grid: bool = False,
         assume_warp_aligned_block_size: bool = False,
     ) -> None:
         self._ctx = ctx
         self._target = target
         self._scheme = scheme
-        self._warp_size = warp_size
         self._manual_launch_grid = manual_launch_grid
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
@@ -608,7 +613,10 @@ class GpuIndexing:
         # the block size determination functions cannot be used here since the iteration space is unknown
         # -> round the block size of the fastest moving dimension up to a multiple of the warp size
         rounded_block_size: PsExpression
-        if self._assume_warp_aligned_block_size:
+        if (
+            self._assume_warp_aligned_block_size
+            and self._hw_props.warp_size is not None
+        ):
             warp_size = self._ast_factory.parse_index(self._hw_props.warp_size)
             rounded_block_size = self._ast_factory.parse_index(
                 PsIntDiv(
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index bbe9aedd9..a3f8a5482 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -145,8 +145,9 @@ def test_block_size_adaptations(
     cfg = CreateKernelConfig(target=target)
     cfg.gpu.indexing_scheme = "linear3d"
     cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
-
-    warp_size = cfg.gpu.default_warp_size(cfg.get_target())
+
+    warp_size = 32
+    cfg.gpu.warp_size = warp_size
 
     ast = create_kernel(asm, cfg)
     kernel = ast.compile()
-- 
GitLab
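
With Target.HIP no longer advertising a default warp/wavefront size, every alignment step
has to tolerate `warp_size is None` and simply skip the rounding in that case. A minimal
standalone sketch (not pystencils code) of this guard:

    from math import prod


    def maybe_align(
        block_size: tuple[int, int, int],
        warp_size: int | None,
        assume_warp_aligned: bool,
    ) -> tuple[int, int, int]:
        """Round the x-dimension up to a warp multiple only if a warp size is known."""
        if (
            assume_warp_aligned
            and warp_size is not None
            and prod(block_size) % warp_size != 0
        ):
            x = -(-block_size[0] // warp_size) * warp_size  # ceil to a multiple
            return (x, block_size[1], block_size[2])
        return block_size


    print(maybe_align((10, 8, 1), 32, True))    # -> (32, 8, 1): warp size known (e.g. CUDA)
    print(maybe_align((10, 8, 1), None, True))  # -> (10, 8, 1): warp size unknown (e.g. HIP)
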


From 3c959156c2d043f3ae3f679a9a518c46a15b827d Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 19 Mar 2025 13:29:51 +0000
Subject: [PATCH 15/15] expose default block size of dynamic launch config as
 property

---
 src/pystencils/codegen/gpu_indexing.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 09570e345..43b612bd7 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -291,6 +291,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
     def parameters(self) -> frozenset[Parameter]:
         """Parameters of this launch configuration"""
         return self._params
+
+    @property
+    def default_block_size(self) -> dim3:
+        return self._default_block_size
 
     @property
     def block_size(self) -> dim3 | None:
-- 
GitLab