From 2e175f0088f16b0028ac03ce788b85086302a89f Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 5 Mar 2025 15:41:00 +0100 Subject: [PATCH 01/28] implement variants of cuda_invoke for all three types of launch configs --- src/pystencilssfg/composer/basic_composer.py | 179 +++++++++++++++++-- src/pystencilssfg/ir/call_tree.py | 48 ++--- src/pystencilssfg/ir/postprocessing.py | 12 +- src/pystencilssfg/lang/cpp/std_tuple.py | 5 +- src/pystencilssfg/lang/cuda.py | 13 ++ src/pystencilssfg/lang/expressions.py | 8 +- 6 files changed, 215 insertions(+), 50 deletions(-) create mode 100644 src/pystencilssfg/lang/cuda.py diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 31337a6..8a07674 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Sequence, TypeAlias +from typing import Sequence, TypeAlias, overload from abc import ABC, abstractmethod import sympy as sp from functools import reduce @@ -13,7 +13,12 @@ from pystencils import ( Assignment, AssignmentCollection, ) -from pystencils.codegen import Kernel +from pystencils.codegen import Kernel, GpuKernel, Lambda +from pystencils.codegen.gpu_indexing import ( + ManualLaunchConfiguration, + AutomaticLaunchConfiguration, + DynamicBlockSizeLaunchConfiguration, +) from pystencils.types import create_type, UserTypeSpec, PsType from ..context import SfgContext, SfgCursor @@ -53,6 +58,7 @@ from ..lang import ( HeaderFile, includes, SfgVar, + SfgKernelParamVar, AugExpr, SupportsFieldExtraction, SupportsVectorExtraction, @@ -398,25 +404,155 @@ class SfgBasicComposer(SfgIComposer): """ return SfgKernelCallNode(kernel_handle) + @overload def cuda_invoke( self, kernel_handle: SfgKernelHandle, - num_blocks: ExprLike, - threads_per_block: ExprLike, - stream: ExprLike | None, - ): - """Dispatch a CUDA kernel to the device.""" - num_blocks_str = str(num_blocks) - tpb_str = str(threads_per_block) - stream_str = str(stream) if stream is not None else None + *, + grid_size: ExprLike, + block_size: ExprLike, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a manual launch grid. + + Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` + set to `True`. + """ - deps = depends(num_blocks) | depends(threads_per_block) - if stream is not None: - deps |= depends(stream) + @overload + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with an automatic launch grid. - return SfgCudaKernelInvocation( - kernel_handle, num_blocks_str, tpb_str, stream_str, deps - ) + This signature accepts kernels generated with an indexing scheme that permits + the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` + """ + + @overload + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + block_size: ExprLike | None = None, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a dynamic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits a user-defined + blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. + The grid size is calculated automatically. 
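+
+        A minimal usage sketch; ``sfg`` is the composer, ``khandle`` an existing
+        GPU kernel handle, and the variable names are purely illustrative::
+
+            from pystencilssfg.lang.cuda import dim3, cudaStream_t
+
+            block_size = dim3().var("blockSize")
+            stream = cudaStream_t().var("stream")
+            sfg.cuda_invoke(khandle, block_size=block_size, stream=stream)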
+ """ + + def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: + ker = kernel_handle.kernel + + if not isinstance(ker, GpuKernel): + raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") + + launch_config = ker.get_launch_configuration() + + from ..lang.cuda import dim3 + + def _render_invocation( + grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None + ): + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_stream = make_statements(stream) if stream is not None else None + + return SfgCudaKernelInvocation( + kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + ) + + grid_size: ExprLike + block_size: ExprLike + stream: ExprLike | None + + match launch_config: + case ManualLaunchConfiguration(): + grid_size = kwargs["grid_size"] + block_size = kwargs["block_size"] + stream = kwargs["stream"] + + return _render_invocation(grid_size, block_size, stream) + + case AutomaticLaunchConfiguration(): + stream = kwargs["stream"] + + grid_size_entries = [ + self.expr_from_lambda(gs) for gs in launch_config._grid_size + ] + grid_size_var = dim3(const=True).var("__grid_size") + + block_size_entries = [ + self.expr_from_lambda(bs) for bs in launch_config._block_size + ] + block_size_var = dim3(const=True).var("__block_size") + + nodes: list[SfgCallTreeNode] = [ + self.init(grid_size_var)(*grid_size_entries), + self.init(block_size_var)(*block_size_entries), + _render_invocation(grid_size_var, block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case DynamicBlockSizeLaunchConfiguration(): + block_size = kwargs["block_size"] + stream = kwargs["stream"] + + from ..lang.cpp import std + + witem_types = [lmb.return_type for lmb in launch_config.num_work_items] + work_items_entries = [ + self.expr_from_lambda(wit) for wit in launch_config.num_work_items + ] + work_items_var = std.tuple(*witem_types, const=True).var("__work_items") + + def _min(a: ExprLike, b: ExprLike): + return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + block_size_var = dim3(const=True).var("__block_size") + + reduced_block_size_entries = [ + _min(work_items_var.get(i), bs) + for i, bs in enumerate( + [block_size_var.x(), block_size_var.y(), block_size_var.z()] + ) + ] + reduced_block_size_var = dim3(const=True).var("__reduced_block_size") + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + reduced_block_size_var.x(), + reduced_block_size_var.y(), + reduced_block_size_var.z(), + ] + ) + ] + grid_size_var = dim3(const=True).var("__grid_size") + + nodes = [ + self.init(block_size_var)(block_size), + self.init(work_items_var)(*work_items_entries), + self.init(reduced_block_size_var)(*reduced_block_size_entries), + self.init(grid_size_var)(*grid_size_entries), + _render_invocation(grid_size_var, reduced_block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case _: + raise ValueError(f"Unexpected launch configuration: {launch_config}") def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence: """Syntax sequencing. 
For details, see `make_sequence`""" @@ -511,6 +647,11 @@ class SfgBasicComposer(SfgIComposer): """ return AugExpr.format(fmt, *deps, **kwdeps) + def expr_from_lambda(self, lamb: Lambda) -> AugExpr: + depends = set(SfgKernelParamVar(p) for p in lamb.parameters) + code = lamb.c_code() + return AugExpr.make(code, depends, dtype=lamb.return_type) + @property def branch(self) -> SfgBranchBuilder: """Use inside a function body to create an if/else conditonal branch. @@ -564,7 +705,11 @@ class SfgBasicComposer(SfgIComposer): var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param return SfgDeferredParamSetter(var, expr) - def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction): + def map_vector( + self, + lhs_components: Sequence[VarLike | sp.Symbol], + rhs: SupportsVectorExtraction, + ): """Extracts scalar numerical values from a vector data type. Args: diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index 24a315d..78ba841 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC): Therefore, every instantiable call tree node must implement the method `get_code`. By convention, the string returned by `get_code` should not contain a trailing newline. """ + def __init__(self) -> None: self._includes: set[HeaderFile] = set() @@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC): By convention, the code block emitted by this function should not contain a trailing newline. """ + @property + def depends(self) -> set[SfgVar]: + """Set of objects this leaf depends on""" + return set() + @property def required_includes(self) -> set[HeaderFile]: """Return a set of header includes required by this node""" @@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC): def children(self) -> Sequence[SfgCallTreeNode]: return () - @property - @abstractmethod - def depends(self) -> set[SfgVar]: - """Set of objects this leaf depends on""" - class SfgEmptyNode(SfgCallTreeLeaf): """A leaf node that does not emit any code. 
@@ -202,21 +203,20 @@ class SfgKernelCallNode(SfgCallTreeLeaf): return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) return f"{fnc_name}({call_parameters});" -class SfgCudaKernelInvocation(SfgCallTreeLeaf): +class SfgCudaKernelInvocation(SfgCallTreeNode): def __init__( self, kernel_handle: SfgKernelHandle, - num_blocks_code: str, - threads_per_block_code: str, - stream_code: str | None, - depends: set[SfgVar], + grid_size: SfgStatements, + block_size: SfgStatements, + stream: SfgStatements | None, ): from pystencils import Target from pystencils.codegen import GpuKernel @@ -229,25 +229,31 @@ class SfgCudaKernelInvocation(SfgCallTreeLeaf): super().__init__() self._kernel_handle = kernel_handle - self._num_blocks = num_blocks_code - self._threads_per_block = threads_per_block_code - self._stream = stream_code - self._depends = depends + self._grid_size = grid_size + self._block_size = block_size + self._stream = stream + + @property + def children(self) -> Sequence[SfgCallTreeNode]: + return ( + self._grid_size, + self._block_size, + ) + ((self._stream,) if self._stream is not None else ()) @property def depends(self) -> set[SfgVar]: - return set(self._kernel_handle.parameters) | self._depends + return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) - grid_args = [self._num_blocks, self._threads_per_block] + grid_args = [self._grid_size, self._block_size] if self._stream is not None: grid_args += [self._stream] - grid = "<<< " + ", ".join(grid_args) + " >>>" + grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>" return f"{fnc_name}{grid}({call_parameters});" diff --git a/src/pystencilssfg/ir/postprocessing.py b/src/pystencilssfg/ir/postprocessing.py index 1e692b0..8966933 100644 --- a/src/pystencilssfg/ir/postprocessing.py +++ b/src/pystencilssfg/ir/postprocessing.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import Sequence, Iterable import warnings -from functools import reduce from dataclasses import dataclass from abc import ABC, abstractmethod @@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride from ..exceptions import SfgException from ..config import CodeStyle -from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements +from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements from ..lang.expressions import SfgKernelParamVar from ..lang import ( SfgVar, @@ -163,17 +162,12 @@ class CallTreePostProcessing: self.handle_sequence(node, ppc) return ppc.live_variables - case SfgCallTreeLeaf(): - return node.depends - case SfgDeferredNode(): raise SfgException("Deferred nodes can only occur inside a sequence.") case _: - return reduce( - lambda x, y: x | y, - (self.get_live_variables(c) for c in node.children), - set(), + return node.depends.union( + *(self.get_live_variables(c) for c in node.children) ) diff --git a/src/pystencilssfg/lang/cpp/std_tuple.py b/src/pystencilssfg/lang/cpp/std_tuple.py index 645b6b5..6d1e1c0 
100644 --- a/src/pystencilssfg/lang/cpp/std_tuple.py +++ b/src/pystencilssfg/lang/cpp/std_tuple.py @@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction): dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref) super().__init__(dtype) + def get(self, idx: int | str) -> AugExpr: + return AugExpr.format("std::get< {} >({})", idx, self) + def _extract_component(self, coordinate: int) -> AugExpr: if coordinate < 0 or coordinate >= self._length: raise ValueError( f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries." ) - return AugExpr.format("std::get< {} >({})", coordinate, self) + return self.get(coordinate) diff --git a/src/pystencilssfg/lang/cuda.py b/src/pystencilssfg/lang/cuda.py new file mode 100644 index 0000000..28794da --- /dev/null +++ b/src/pystencilssfg/lang/cuda.py @@ -0,0 +1,13 @@ +from .expressions import CppClass, cpptype + + +from ..extensions.gpu import dim3class + +dim3 = dim3class( + "<cuda_runtime.h>" +) +"""Reflection of CUDA's `dim3`.""" + + +class cudaStream_t(CppClass): + template = cpptype("cudaStream_t", "<cuda_runtime.h>") diff --git a/src/pystencilssfg/lang/expressions.py b/src/pystencilssfg/lang/expressions.py index 135a54e..8be59b0 100644 --- a/src/pystencilssfg/lang/expressions.py +++ b/src/pystencilssfg/lang/expressions.py @@ -218,8 +218,12 @@ class AugExpr: return self._bind(expr) @staticmethod - def make(code: str, depends: Iterable[SfgVar | AugExpr]): - return AugExpr()._bind(DependentExpression(code, depends)) + def make( + code: str, + depends: Iterable[SfgVar | AugExpr], + dtype: UserTypeSpec | None = None, + ): + return AugExpr(dtype)._bind(DependentExpression(code, depends)) @staticmethod def format(fmt: str, *deps, **kwdeps) -> AugExpr: -- GitLab From e8f5f8aef9a1191f19b71138aabcb28dc55a08e9 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 5 Mar 2025 15:56:48 +0100 Subject: [PATCH 02/28] small fixes to cuda_invoke --- src/pystencilssfg/composer/basic_composer.py | 24 +++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 8a07674..686b60d 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -493,7 +493,7 @@ class SfgBasicComposer(SfgIComposer): ] block_size_var = dim3(const=True).var("__block_size") - nodes: list[SfgCallTreeNode] = [ + nodes = [ self.init(grid_size_var)(*grid_size_entries), self.init(block_size_var)(*block_size_entries), _render_invocation(grid_size_var, block_size_var, stream), @@ -502,12 +502,26 @@ class SfgBasicComposer(SfgIComposer): return SfgBlock(SfgSequence(nodes)) case DynamicBlockSizeLaunchConfiguration(): - block_size = kwargs["block_size"] + user_block_size: ExprLike | None = kwargs["block_size"] stream = kwargs["stream"] + if user_block_size is None: + if launch_config.block_size is None: + raise ValueError( + "Neither a user-defined nor a default block size was defined." 
+ ) + + block_size_init_args = tuple( + str(bs) for bs in launch_config.block_size + ) + else: + block_size_init_args = (user_block_size,) + + block_size_var = dim3(const=True).var("__block_size") + from ..lang.cpp import std - witem_types = [lmb.return_type for lmb in launch_config.num_work_items] + witem_types = [wit.return_type for wit in launch_config.num_work_items] work_items_entries = [ self.expr_from_lambda(wit) for wit in launch_config.num_work_items ] @@ -519,8 +533,6 @@ class SfgBasicComposer(SfgIComposer): def _div_ceil(a: ExprLike, b: ExprLike): return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - block_size_var = dim3(const=True).var("__block_size") - reduced_block_size_entries = [ _min(work_items_var.get(i), bs) for i, bs in enumerate( @@ -542,7 +554,7 @@ class SfgBasicComposer(SfgIComposer): grid_size_var = dim3(const=True).var("__grid_size") nodes = [ - self.init(block_size_var)(block_size), + self.init(block_size_var)(*block_size_init_args), self.init(work_items_var)(*work_items_entries), self.init(reduced_block_size_var)(*reduced_block_size_entries), self.init(grid_size_var)(*grid_size_entries), -- GitLab From 69a150edeeeec3b618ba1b70fef296db245dc5c4 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 5 Mar 2025 16:43:52 +0100 Subject: [PATCH 03/28] first CUDA genscript test - not compiling just yet --- src/pystencilssfg/composer/basic_composer.py | 22 ++++++++---------- tests/generator_scripts/index.yaml | 14 +++++++++++ tests/generator_scripts/source/CudaKernels.py | 23 +++++++++++++++++++ .../test_generator_scripts.py | 7 ++++-- 4 files changed, 52 insertions(+), 14 deletions(-) create mode 100644 tests/generator_scripts/source/CudaKernels.py diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 686b60d..0466e6c 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -470,19 +470,16 @@ class SfgBasicComposer(SfgIComposer): grid_size: ExprLike block_size: ExprLike - stream: ExprLike | None + stream: ExprLike | None = kwargs.get("stream", None) match launch_config: case ManualLaunchConfiguration(): grid_size = kwargs["grid_size"] block_size = kwargs["block_size"] - stream = kwargs["stream"] return _render_invocation(grid_size, block_size, stream) case AutomaticLaunchConfiguration(): - stream = kwargs["stream"] - grid_size_entries = [ self.expr_from_lambda(gs) for gs in launch_config._grid_size ] @@ -502,9 +499,9 @@ class SfgBasicComposer(SfgIComposer): return SfgBlock(SfgSequence(nodes)) case DynamicBlockSizeLaunchConfiguration(): - user_block_size: ExprLike | None = kwargs["block_size"] - stream = kwargs["stream"] + user_block_size: ExprLike | None = kwargs.get("block_size", None) + block_size_init_args: tuple[ExprLike, ...] if user_block_size is None: if launch_config.block_size is None: raise ValueError( @@ -521,11 +518,12 @@ class SfgBasicComposer(SfgIComposer): from ..lang.cpp import std - witem_types = [wit.return_type for wit in launch_config.num_work_items] work_items_entries = [ self.expr_from_lambda(wit) for wit in launch_config.num_work_items ] - work_items_var = std.tuple(*witem_types, const=True).var("__work_items") + work_items_var = std.tuple( + "uint32_t", "uint32_t", "uint32_t", const=True + ).var("__work_items") def _min(a: ExprLike, b: ExprLike): return AugExpr.format("{a} < {b} ? 
{a} : {b}", a=a, b=b) @@ -536,7 +534,7 @@ class SfgBasicComposer(SfgIComposer): reduced_block_size_entries = [ _min(work_items_var.get(i), bs) for i, bs in enumerate( - [block_size_var.x(), block_size_var.y(), block_size_var.z()] + [block_size_var.x, block_size_var.y, block_size_var.z] ) ] reduced_block_size_var = dim3(const=True).var("__reduced_block_size") @@ -545,9 +543,9 @@ class SfgBasicComposer(SfgIComposer): _div_ceil(work_items_var.get(i), bs) for i, bs in enumerate( [ - reduced_block_size_var.x(), - reduced_block_size_var.y(), - reduced_block_size_var.z(), + reduced_block_size_var.x, + reduced_block_size_var.y, + reduced_block_size_var.z, ] ) ] diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index 1c97aaf..bfbedda 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -90,6 +90,20 @@ StlContainers1D: MdSpanFixedShapeLayouts: MdSpanLbStreaming: +# CUDA + +CudaKernels: + sfg-args: + file-extensions: ["cuh", "cu"] + compile: + cxx: nvcc + cxx-flags: + - -std=c++20 + - -Werror + - all-warnings + - --expt-relaxed-constexpr + skip-if-not-found: true + # SYCL SyclKernels: diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py new file mode 100644 index 0000000..9bd37a5 --- /dev/null +++ b/tests/generator_scripts/source/CudaKernels.py @@ -0,0 +1,23 @@ +from pystencilssfg import SourceFileGenerator +from pystencilssfg.lang.cuda import dim3 +from pystencilssfg.lang.cpp import std + +import pystencils as ps + +std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +with SourceFileGenerator() as sfg: + + src, dst = ps.fields("src, dst: double[3D]", layout="c") + asm = ps.Assignment(dst(0), 2 * src(0)) + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + khandle = sfg.kernels.create(asm, "scale", cfg) + + block_size = dim3().var("blockSize") + + sfg.function("invoke")( + sfg.map_field(src, std.mdspan.from_field(src)), + sfg.map_field(dst, std.mdspan.from_field(dst)), + sfg.cuda_invoke(khandle, block_size=block_size) + ) diff --git a/tests/generator_scripts/test_generator_scripts.py b/tests/generator_scripts/test_generator_scripts.py index 6f2ff16..8571d25 100644 --- a/tests/generator_scripts/test_generator_scripts.py +++ b/tests/generator_scripts/test_generator_scripts.py @@ -12,16 +12,19 @@ import shutil import warnings import subprocess +from pystencils.include import get_pystencils_include_path + THIS_DIR = pathlib.Path(__file__).parent DEPS_DIR = THIS_DIR / "deps" MDSPAN_QUAL_PATH = "mdspan-mdspan-0.6.0/include/" +PYSTENCILS_RT_INCLUDE_PATH = get_pystencils_include_path() TEST_INDEX = THIS_DIR / "index.yaml" SOURCE_DIR = THIS_DIR / "source" EXPECTED_DIR = THIS_DIR / "expected" -CXX_INCLUDE_FLAGS = ["-I", f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}"] +CXX_INCLUDE_FLAGS = ["-I", f"{DEPS_DIR}/{MDSPAN_QUAL_PATH},{PYSTENCILS_RT_INCLUDE_PATH}"] def prepare_deps(): @@ -101,7 +104,7 @@ class GenScriptTest: for ext in self._expected_extensions: fname = f"{self._name}.{ext}" self._expected_files.add(fname) - if ext in ("cpp", "cxx", "c++"): + if ext in ("cpp", "cxx", "c++", "cu", "hip"): self._files_to_compile.append(fname) compile_descr: dict = test_description.get("compile", dict()) -- GitLab From e93ca92c9508235d1b85aff96e6eb5096185aac6 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 5 Mar 2025 16:48:49 +0100 Subject: [PATCH 04/28] fix include flags in genscript test suite --- 
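Note: the previous commit folded both include roots into a single `-I`
argument with a comma-joined path, which the compiler treats as one literal
(and nonexistent) directory. Each include root needs its own `-I` pair; a
sketch of the distinction, with placeholder path variables:

    # before: one bogus search path "<mdspan_root>,<pystencils_root>"
    flags = ["-I", f"{mdspan_root},{pystencils_root}"]
    # after: two separate search roots
    flags = ["-I", str(mdspan_root), "-I", str(pystencils_root)]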
tests/generator_scripts/test_generator_scripts.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/generator_scripts/test_generator_scripts.py b/tests/generator_scripts/test_generator_scripts.py index 8571d25..8901c34 100644 --- a/tests/generator_scripts/test_generator_scripts.py +++ b/tests/generator_scripts/test_generator_scripts.py @@ -24,7 +24,12 @@ PYSTENCILS_RT_INCLUDE_PATH = get_pystencils_include_path() TEST_INDEX = THIS_DIR / "index.yaml" SOURCE_DIR = THIS_DIR / "source" EXPECTED_DIR = THIS_DIR / "expected" -CXX_INCLUDE_FLAGS = ["-I", f"{DEPS_DIR}/{MDSPAN_QUAL_PATH},{PYSTENCILS_RT_INCLUDE_PATH}"] +CXX_INCLUDE_FLAGS = [ + "-I", + f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}", + "-I", + PYSTENCILS_RT_INCLUDE_PATH, +] def prepare_deps(): -- GitLab From 6c120a849a18bb8c5b6e60dad50c0ea5611bf1ba Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 6 Mar 2025 09:55:43 +0100 Subject: [PATCH 05/28] Add CUDA and HIP API provider protocols. Factor out GPU stuff into separate Gpu Composer. --- src/pystencilssfg/composer/basic_composer.py | 170 +----------- src/pystencilssfg/composer/composer.py | 4 +- src/pystencilssfg/composer/gpu_composer.py | 248 ++++++++++++++++++ src/pystencilssfg/lang/cuda.py | 13 - src/pystencilssfg/lang/gpu.py | 52 ++++ tests/generator_scripts/index.yaml | 13 + tests/generator_scripts/source/CudaKernels.py | 8 +- tests/generator_scripts/source/HipKernels.py | 23 ++ 8 files changed, 345 insertions(+), 186 deletions(-) create mode 100644 src/pystencilssfg/composer/gpu_composer.py delete mode 100644 src/pystencilssfg/lang/cuda.py create mode 100644 src/pystencilssfg/lang/gpu.py create mode 100644 tests/generator_scripts/source/HipKernels.py diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 0466e6c..97334db 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Sequence, TypeAlias, overload +from typing import Sequence, TypeAlias from abc import ABC, abstractmethod import sympy as sp from functools import reduce @@ -13,12 +13,7 @@ from pystencils import ( Assignment, AssignmentCollection, ) -from pystencils.codegen import Kernel, GpuKernel, Lambda -from pystencils.codegen.gpu_indexing import ( - ManualLaunchConfiguration, - AutomaticLaunchConfiguration, - DynamicBlockSizeLaunchConfiguration, -) +from pystencils.codegen import Kernel, Lambda from pystencils.types import create_type, UserTypeSpec, PsType from ..context import SfgContext, SfgCursor @@ -26,7 +21,6 @@ from .custom import CustomGenerator from ..ir import ( SfgCallTreeNode, SfgKernelCallNode, - SfgCudaKernelInvocation, SfgStatements, SfgFunctionParams, SfgRequireIncludes, @@ -404,166 +398,6 @@ class SfgBasicComposer(SfgIComposer): """ return SfgKernelCallNode(kernel_handle) - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - grid_size: ExprLike, - block_size: ExprLike, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with a manual launch grid. - - Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` - set to `True`. - """ - - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with an automatic launch grid. 
- - This signature accepts kernels generated with an indexing scheme that permits - the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` - """ - - @overload - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - *, - block_size: ExprLike | None = None, - stream: ExprLike | None = None, - ) -> SfgCallTreeNode: - """Invoke a CUDA kernel with a dynamic launch grid. - - This signature accepts kernels generated with an indexing scheme that permits a user-defined - blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. - The grid size is calculated automatically. - """ - - def cuda_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: - ker = kernel_handle.kernel - - if not isinstance(ker, GpuKernel): - raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") - - launch_config = ker.get_launch_configuration() - - from ..lang.cuda import dim3 - - def _render_invocation( - grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None - ): - stmt_grid_size = make_statements(grid_size) - stmt_block_size = make_statements(block_size) - stmt_stream = make_statements(stream) if stream is not None else None - - return SfgCudaKernelInvocation( - kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream - ) - - grid_size: ExprLike - block_size: ExprLike - stream: ExprLike | None = kwargs.get("stream", None) - - match launch_config: - case ManualLaunchConfiguration(): - grid_size = kwargs["grid_size"] - block_size = kwargs["block_size"] - - return _render_invocation(grid_size, block_size, stream) - - case AutomaticLaunchConfiguration(): - grid_size_entries = [ - self.expr_from_lambda(gs) for gs in launch_config._grid_size - ] - grid_size_var = dim3(const=True).var("__grid_size") - - block_size_entries = [ - self.expr_from_lambda(bs) for bs in launch_config._block_size - ] - block_size_var = dim3(const=True).var("__block_size") - - nodes = [ - self.init(grid_size_var)(*grid_size_entries), - self.init(block_size_var)(*block_size_entries), - _render_invocation(grid_size_var, block_size_var, stream), - ] - - return SfgBlock(SfgSequence(nodes)) - - case DynamicBlockSizeLaunchConfiguration(): - user_block_size: ExprLike | None = kwargs.get("block_size", None) - - block_size_init_args: tuple[ExprLike, ...] - if user_block_size is None: - if launch_config.block_size is None: - raise ValueError( - "Neither a user-defined nor a default block size was defined." - ) - - block_size_init_args = tuple( - str(bs) for bs in launch_config.block_size - ) - else: - block_size_init_args = (user_block_size,) - - block_size_var = dim3(const=True).var("__block_size") - - from ..lang.cpp import std - - work_items_entries = [ - self.expr_from_lambda(wit) for wit in launch_config.num_work_items - ] - work_items_var = std.tuple( - "uint32_t", "uint32_t", "uint32_t", const=True - ).var("__work_items") - - def _min(a: ExprLike, b: ExprLike): - return AugExpr.format("{a} < {b} ? 
{a} : {b}", a=a, b=b) - - def _div_ceil(a: ExprLike, b: ExprLike): - return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - - reduced_block_size_entries = [ - _min(work_items_var.get(i), bs) - for i, bs in enumerate( - [block_size_var.x, block_size_var.y, block_size_var.z] - ) - ] - reduced_block_size_var = dim3(const=True).var("__reduced_block_size") - - grid_size_entries = [ - _div_ceil(work_items_var.get(i), bs) - for i, bs in enumerate( - [ - reduced_block_size_var.x, - reduced_block_size_var.y, - reduced_block_size_var.z, - ] - ) - ] - grid_size_var = dim3(const=True).var("__grid_size") - - nodes = [ - self.init(block_size_var)(*block_size_init_args), - self.init(work_items_var)(*work_items_entries), - self.init(reduced_block_size_var)(*reduced_block_size_entries), - self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var, stream), - ] - - return SfgBlock(SfgSequence(nodes)) - - case _: - raise ValueError(f"Unexpected launch configuration: {launch_config}") - def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence: """Syntax sequencing. For details, see `make_sequence`""" return make_sequence(*args) diff --git a/src/pystencilssfg/composer/composer.py b/src/pystencilssfg/composer/composer.py index bba479e..b1cfc4b 100644 --- a/src/pystencilssfg/composer/composer.py +++ b/src/pystencilssfg/composer/composer.py @@ -3,12 +3,13 @@ from typing import TYPE_CHECKING from .basic_composer import SfgBasicComposer from .class_composer import SfgClassComposer +from .gpu_composer import SfgGpuComposer if TYPE_CHECKING: from ..context import SfgContext -class SfgComposer(SfgBasicComposer, SfgClassComposer): +class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer): """Primary interface for constructing source files in pystencils-sfg. 
The SfgComposer combines the `SfgBasicComposer` @@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer): def __init__(self, sfg: SfgContext | SfgBasicComposer): SfgBasicComposer.__init__(self, sfg) SfgClassComposer.__init__(self) + SfgGpuComposer.__init__(self) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py new file mode 100644 index 0000000..bedc206 --- /dev/null +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -0,0 +1,248 @@ +from __future__ import annotations + +from typing import overload + +from pystencils.codegen import GpuKernel +from pystencils.codegen.gpu_indexing import ( + ManualLaunchConfiguration, + AutomaticLaunchConfiguration, + DynamicBlockSizeLaunchConfiguration, +) + +from .mixin import SfgComposerMixIn +from .basic_composer import SfgBasicComposer, make_statements + +from ..ir import ( + SfgKernelHandle, + SfgCallTreeNode, + SfgCudaKernelInvocation, + SfgBlock, + SfgSequence, +) +from ..lang import ExprLike, AugExpr +from ..lang.gpu import ProvidesGpuRuntimeAPI + + +class SfgGpuComposer(SfgComposerMixIn): + + def __init__(self) -> None: + self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None + + def use_cuda(self): + from ..lang.gpu import CudaAPI + + if self._gpu_api_provider is not None and not isinstance( + self._gpu_api_provider, CudaAPI + ): + raise ValueError( + "Cannot select CUDA GPU API since another API was already chosen" + ) + + self._gpu_api_provider = CudaAPI() + + def use_hip(self): + from ..lang.gpu import HipAPI + + if self._gpu_api_provider is not None and not isinstance( + self._gpu_api_provider, HipAPI + ): + raise ValueError( + "Cannot select HIP GPU API since another API was already chosen" + ) + + self._gpu_api_provider = HipAPI() + + @property + def gpu_api(self) -> ProvidesGpuRuntimeAPI: + if self._gpu_api_provider is None: + raise AttributeError( + "No GPU API was selected - call `use_cuda()` or `use_hip()` first." + ) + + return self._gpu_api_provider + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + grid_size: ExprLike, + block_size: ExprLike, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a manual launch grid. + + Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>` + set to `True`. + """ + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with an automatic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits + the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>` + """ + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + block_size: ExprLike | None = None, + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: + """Invoke a CUDA kernel with a dynamic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits a user-defined + blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. + The grid size is calculated automatically. 
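+
+        A minimal usage sketch (``khandle`` is assumed to be a handle to an
+        existing GPU kernel; `use_cuda` or `use_hip` must be called first)::
+
+            block_size = sfg.gpu_api.dim3().var("blockSize")
+            sfg.gpu_invoke(khandle, block_size=block_size)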
+ """ + + def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: + assert isinstance( + self, SfgBasicComposer + ) # for type checking this function body + + ker = kernel_handle.kernel + + if not isinstance(ker, GpuKernel): + raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") + + launch_config = ker.get_launch_configuration() + + dim3 = self.gpu_api.dim3 + + def _render_invocation( + grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None + ): + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_stream = make_statements(stream) if stream is not None else None + + return SfgCudaKernelInvocation( + kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + ) + + grid_size: ExprLike + block_size: ExprLike + stream: ExprLike | None = kwargs.get("stream", None) + + match launch_config: + case ManualLaunchConfiguration(): + grid_size = kwargs["grid_size"] + block_size = kwargs["block_size"] + + return _render_invocation(grid_size, block_size, stream) + + case AutomaticLaunchConfiguration(): + grid_size_entries = [ + self.expr_from_lambda(gs) for gs in launch_config._grid_size + ] + grid_size_var = dim3(const=True).var("__grid_size") + + block_size_entries = [ + self.expr_from_lambda(bs) for bs in launch_config._block_size + ] + block_size_var = dim3(const=True).var("__block_size") + + nodes = [ + self.init(grid_size_var)(*grid_size_entries), + self.init(block_size_var)(*block_size_entries), + _render_invocation(grid_size_var, block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case DynamicBlockSizeLaunchConfiguration(): + user_block_size: ExprLike | None = kwargs.get("block_size", None) + + block_size_init_args: tuple[ExprLike, ...] + if user_block_size is None: + if launch_config.block_size is None: + raise ValueError( + "Neither a user-defined nor a default block size was defined." + ) + + block_size_init_args = tuple( + str(bs) for bs in launch_config.block_size + ) + else: + block_size_init_args = (user_block_size,) + + block_size_var = dim3(const=True).var("__block_size") + + from ..lang.cpp import std + + work_items_entries = [ + self.expr_from_lambda(wit) for wit in launch_config.num_work_items + ] + work_items_var = std.tuple( + "uint32_t", "uint32_t", "uint32_t", const=True + ).var("__work_items") + + def _min(a: ExprLike, b: ExprLike): + return AugExpr.format("{a} < {b} ? 
{a} : {b}", a=a, b=b) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + reduced_block_size_entries = [ + _min(work_items_var.get(i), bs) + for i, bs in enumerate( + [block_size_var.x, block_size_var.y, block_size_var.z] + ) + ] + reduced_block_size_var = dim3(const=True).var("__reduced_block_size") + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + reduced_block_size_var.x, + reduced_block_size_var.y, + reduced_block_size_var.z, + ] + ) + ] + grid_size_var = dim3(const=True).var("__grid_size") + + nodes = [ + self.init(block_size_var)(*block_size_init_args), + self.init(work_items_var)(*work_items_entries), + self.init(reduced_block_size_var)(*reduced_block_size_entries), + self.init(grid_size_var)(*grid_size_entries), + _render_invocation(grid_size_var, reduced_block_size_var, stream), + ] + + return SfgBlock(SfgSequence(nodes)) + + case _: + raise ValueError(f"Unexpected launch configuration: {launch_config}") + + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + num_blocks: ExprLike, + threads_per_block: ExprLike, + stream: ExprLike | None, + ): + from warnings import warn + + warn( + "cuda_invoke is deprecated and will be removed before version 0.1. " + "Call `use_cuda()` and use `gpu_invoke` instead.", + FutureWarning, + ) + + return self.gpu_invoke( + kernel_handle, + grid_size=num_blocks, + block_size=threads_per_block, + stream=stream, + ) diff --git a/src/pystencilssfg/lang/cuda.py b/src/pystencilssfg/lang/cuda.py deleted file mode 100644 index 28794da..0000000 --- a/src/pystencilssfg/lang/cuda.py +++ /dev/null @@ -1,13 +0,0 @@ -from .expressions import CppClass, cpptype - - -from ..extensions.gpu import dim3class - -dim3 = dim3class( - "<cuda_runtime.h>" -) -"""Reflection of CUDA's `dim3`.""" - - -class cudaStream_t(CppClass): - template = cpptype("cudaStream_t", "<cuda_runtime.h>") diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py new file mode 100644 index 0000000..ccf86d9 --- /dev/null +++ b/src/pystencilssfg/lang/gpu.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from typing import Protocol + +from .expressions import CppClass, cpptype, AugExpr + + +class _Dim3Base(CppClass): + def ctor(self, dim0=1, dim1=1, dim2=1): + return self.ctor_bind(dim0, dim1, dim2) + + @property + def x(self): + return AugExpr.format("{}.x", self) + + @property + def y(self): + return AugExpr.format("{}.y", self) + + @property + def z(self): + return AugExpr.format("{}.z", self) + + @property + def dims(self): + """The dims property.""" + return [self.x, self.y, self.z] + + +class ProvidesGpuRuntimeAPI(Protocol): + + dim3: type[_Dim3Base] + + stream_t: type[AugExpr] + + +class CudaAPI(ProvidesGpuRuntimeAPI): + + class dim3(_Dim3Base): + template = cpptype("dim3", "<cuda_runtime.h>") + + class stream_t(CppClass): + template = cpptype("cudaStream_t", "<cuda_runtime.h>") + + +class HipAPI(ProvidesGpuRuntimeAPI): + + class dim3(_Dim3Base): + template = cpptype("dim3", "<hip/hip_runtime.h>") + + class stream_t(CppClass): + template = cpptype("hipStream_t", "<hip/hip_runtime.h>") diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index bfbedda..837ea10 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -104,6 +104,19 @@ CudaKernels: - --expt-relaxed-constexpr skip-if-not-found: true +# HIP + +HipKernels: + sfg-args: + file-extensions: ["h++", "hip"] + compile: + cxx: hipcc + 
cxx-flags: + - -std=c++20 + - -Wall + - -Werror + skip-if-not-found: true + # SYCL SyclKernels: diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py index 9bd37a5..21064f6 100644 --- a/tests/generator_scripts/source/CudaKernels.py +++ b/tests/generator_scripts/source/CudaKernels.py @@ -1,5 +1,4 @@ from pystencilssfg import SourceFileGenerator -from pystencilssfg.lang.cuda import dim3 from pystencilssfg.lang.cpp import std import pystencils as ps @@ -7,6 +6,7 @@ import pystencils as ps std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") with SourceFileGenerator() as sfg: + sfg.use_cuda() src, dst = ps.fields("src, dst: double[3D]", layout="c") asm = ps.Assignment(dst(0), 2 * src(0)) @@ -14,10 +14,10 @@ with SourceFileGenerator() as sfg: khandle = sfg.kernels.create(asm, "scale", cfg) - block_size = dim3().var("blockSize") - + block_size = sfg.gpu_api.dim3().var("blockSize") + sfg.function("invoke")( sfg.map_field(src, std.mdspan.from_field(src)), sfg.map_field(dst, std.mdspan.from_field(dst)), - sfg.cuda_invoke(khandle, block_size=block_size) + sfg.gpu_invoke(khandle, block_size=block_size), ) diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py new file mode 100644 index 0000000..16508d2 --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.py @@ -0,0 +1,23 @@ +from pystencilssfg import SourceFileGenerator +from pystencilssfg.lang.cpp import std + +import pystencils as ps + +std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +with SourceFileGenerator() as sfg: + sfg.use_hip() + + src, dst = ps.fields("src, dst: double[3D]", layout="c") + asm = ps.Assignment(dst(0), 2 * src(0)) + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + khandle = sfg.kernels.create(asm, "scale", cfg) + + block_size = sfg.gpu_api.dim3().var("blockSize") + + sfg.function("invoke")( + sfg.map_field(src, std.mdspan.from_field(src)), + sfg.map_field(dst, std.mdspan.from_field(dst)), + sfg.gpu_invoke(khandle, block_size=block_size), + ) -- GitLab From 635d3693f00ee5db633a539852d08877a1cdea5a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 6 Mar 2025 17:04:14 +0100 Subject: [PATCH 06/28] first working HIP test case --- src/pystencilssfg/composer/gpu_composer.py | 37 ++++++---- src/pystencilssfg/ir/__init__.py | 4 +- src/pystencilssfg/ir/call_tree.py | 28 ++++++-- tests/generator_scripts/index.yaml | 2 +- .../source/HipKernels.harness.cpp | 72 +++++++++++++++++++ tests/generator_scripts/source/HipKernels.py | 10 +-- 6 files changed, 129 insertions(+), 24 deletions(-) create mode 100644 tests/generator_scripts/source/HipKernels.harness.cpp diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index bedc206..6394980 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -15,7 +15,7 @@ from .basic_composer import SfgBasicComposer, make_statements from ..ir import ( SfgKernelHandle, SfgCallTreeNode, - SfgCudaKernelInvocation, + SfgGpuKernelInvocation, SfgBlock, SfgSequence, ) @@ -68,6 +68,7 @@ class SfgGpuComposer(SfgComposerMixIn): *, grid_size: ExprLike, block_size: ExprLike, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with a manual launch grid. 
@@ -81,6 +82,7 @@ class SfgGpuComposer(SfgComposerMixIn): self, kernel_handle: SfgKernelHandle, *, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with an automatic launch grid. @@ -95,6 +97,7 @@ class SfgGpuComposer(SfgComposerMixIn): kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, + shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode: """Invoke a CUDA kernel with a dynamic launch grid. @@ -118,27 +121,35 @@ class SfgGpuComposer(SfgComposerMixIn): dim3 = self.gpu_api.dim3 - def _render_invocation( - grid_size: ExprLike, block_size: ExprLike, stream: ExprLike | None - ): + grid_size: ExprLike + block_size: ExprLike + shared_memory_bytes: ExprLike = kwargs.get("shared_memory_bytes", "0") + stream: ExprLike | None = kwargs.get("stream", None) + + def _render_invocation(grid_size: ExprLike, block_size: ExprLike): stmt_grid_size = make_statements(grid_size) stmt_block_size = make_statements(block_size) + stmt_smem = ( + make_statements(shared_memory_bytes) + if shared_memory_bytes is not None + else None + ) stmt_stream = make_statements(stream) if stream is not None else None - return SfgCudaKernelInvocation( - kernel_handle, stmt_grid_size, stmt_block_size, stmt_stream + return SfgGpuKernelInvocation( + kernel_handle, + stmt_grid_size, + stmt_block_size, + shared_memory_bytes=stmt_smem, + stream=stmt_stream, ) - grid_size: ExprLike - block_size: ExprLike - stream: ExprLike | None = kwargs.get("stream", None) - match launch_config: case ManualLaunchConfiguration(): grid_size = kwargs["grid_size"] block_size = kwargs["block_size"] - return _render_invocation(grid_size, block_size, stream) + return _render_invocation(grid_size, block_size) case AutomaticLaunchConfiguration(): grid_size_entries = [ @@ -154,7 +165,7 @@ class SfgGpuComposer(SfgComposerMixIn): nodes = [ self.init(grid_size_var)(*grid_size_entries), self.init(block_size_var)(*block_size_entries), - _render_invocation(grid_size_var, block_size_var, stream), + _render_invocation(grid_size_var, block_size_var), ] return SfgBlock(SfgSequence(nodes)) @@ -217,7 +228,7 @@ class SfgGpuComposer(SfgComposerMixIn): self.init(work_items_var)(*work_items_entries), self.init(reduced_block_size_var)(*reduced_block_size_entries), self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var, stream), + _render_invocation(grid_size_var, reduced_block_size_var), ] return SfgBlock(SfgSequence(nodes)) diff --git a/src/pystencilssfg/ir/__init__.py b/src/pystencilssfg/ir/__init__.py index 8f03fed..0d93fb1 100644 --- a/src/pystencilssfg/ir/__init__.py +++ b/src/pystencilssfg/ir/__init__.py @@ -3,7 +3,7 @@ from .call_tree import ( SfgCallTreeLeaf, SfgEmptyNode, SfgKernelCallNode, - SfgCudaKernelInvocation, + SfgGpuKernelInvocation, SfgBlock, SfgSequence, SfgStatements, @@ -47,7 +47,7 @@ __all__ = [ "SfgCallTreeLeaf", "SfgEmptyNode", "SfgKernelCallNode", - "SfgCudaKernelInvocation", + "SfgGpuKernelInvocation", "SfgSequence", "SfgBlock", "SfgStatements", diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index 78ba841..ab84db4 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -210,12 +210,20 @@ class SfgKernelCallNode(SfgCallTreeLeaf): return f"{fnc_name}({call_parameters});" -class SfgCudaKernelInvocation(SfgCallTreeNode): +class SfgGpuKernelInvocation(SfgCallTreeNode): + """A CUDA or HIP kernel invocation. 
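+
+    Emits the chevron launch syntax
+    ``kernel<<< grid_size, block_size[, shared_memory_bytes[, stream]] >>>(args...)``,
+    where the optional arguments are included only if they were provided.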
+ + See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration + or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions + for the syntax. + """ + def __init__( self, kernel_handle: SfgKernelHandle, grid_size: SfgStatements, block_size: SfgStatements, + shared_memory_bytes: SfgStatements | None, stream: SfgStatements | None, ): from pystencils import Target @@ -231,14 +239,23 @@ class SfgCudaKernelInvocation(SfgCallTreeNode): self._kernel_handle = kernel_handle self._grid_size = grid_size self._block_size = block_size + self._shared_memory_bytes = shared_memory_bytes self._stream = stream @property def children(self) -> Sequence[SfgCallTreeNode]: return ( - self._grid_size, - self._block_size, - ) + ((self._stream,) if self._stream is not None else ()) + ( + self._grid_size, + self._block_size, + ) + + ( + (self._shared_memory_bytes,) + if self._shared_memory_bytes is not None + else () + ) + + ((self._stream,) if self._stream is not None else ()) + ) @property def depends(self) -> set[SfgVar]: @@ -250,6 +267,9 @@ class SfgCudaKernelInvocation(SfgCallTreeNode): call_parameters = ", ".join([p.name for p in kparams]) grid_args = [self._grid_size, self._block_size] + if self._shared_memory_bytes is not None: + grid_args += [self._shared_memory_bytes] + if self._stream is not None: grid_args += [self._stream] diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index 837ea10..788c8bd 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -108,7 +108,7 @@ CudaKernels: HipKernels: sfg-args: - file-extensions: ["h++", "hip"] + file-extensions: ["hpp", "hip"] compile: cxx: hipcc cxx-flags: diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp new file mode 100644 index 0000000..495f100 --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -0,0 +1,72 @@ +#include "HipKernels.hpp" + +#include <hip/hip_runtime.h> + +#include <experimental/mdspan> +#include <random> +#include <iostream> + +#undef NDEBUG +#include <cassert> + +namespace stdex = std::experimental; + +using extents_t = stdex::dextents<uint64_t, 3>; +using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>; + +void checkHipError(hipError_t err) +{ + if (err != hipSuccess) + { + std::cerr << "HIP Error: " << err << std::endl; + exit(2); + } +} + +int main(void) +{ + + extents_t extents{23, 25, 132}; + size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)}; + + double *data_src; + checkHipError(hipMallocManaged<double>(&data_src, sizeof(double) * items)); + field_t src{data_src, extents}; + + double *data_dst; + checkHipError(hipMallocManaged<double>(&data_dst, sizeof(double) * items)); + field_t dst{data_dst, extents}; + + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<double> distrib{-1.0, 1.0}; + + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + } + + dim3 blockSize{64, 8, 1}; + + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + + gen::gpuScaleKernel(blockSize, dst, src, stream); + + checkHipError(hipStreamSynchronize(stream)); + + for (size_t i = 0; i < items; ++i) + { + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + 
exit(EXIT_FAILURE); + } + } + + checkHipError(hipFree(data_src)); + checkHipError(hipFree(data_dst)); + + return EXIT_SUCCESS; +} diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index 16508d2..ed22933 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -7,6 +7,7 @@ std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan with SourceFileGenerator() as sfg: sfg.use_hip() + sfg.namespace("gen") src, dst = ps.fields("src, dst: double[3D]", layout="c") asm = ps.Assignment(dst(0), 2 * src(0)) @@ -15,9 +16,10 @@ with SourceFileGenerator() as sfg: khandle = sfg.kernels.create(asm, "scale", cfg) block_size = sfg.gpu_api.dim3().var("blockSize") + stream = sfg.gpu_api.stream_t().var("stream") - sfg.function("invoke")( - sfg.map_field(src, std.mdspan.from_field(src)), - sfg.map_field(dst, std.mdspan.from_field(dst)), - sfg.gpu_invoke(khandle, block_size=block_size), + sfg.function("gpuScaleKernel")( + sfg.map_field(src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")), + sfg.map_field(dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), ) -- GitLab From b1b71c327617e57eee03c532f74c38116439a39d Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:24:03 +0100 Subject: [PATCH 07/28] extend HIP test case. Do not alter block sizes in composer. --- src/pystencilssfg/composer/gpu_composer.py | 53 +++++-------- src/pystencilssfg/context.py | 4 + .../source/HipKernels.harness.cpp | 73 ++++++++++++----- tests/generator_scripts/source/HipKernels.py | 78 ++++++++++++++++--- 4 files changed, 146 insertions(+), 62 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 6394980..b24afcd 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -29,31 +29,24 @@ class SfgGpuComposer(SfgComposerMixIn): self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None def use_cuda(self): + """Instruct the GPU composer to use the CUDA runtime API""" from ..lang.gpu import CudaAPI - if self._gpu_api_provider is not None and not isinstance( - self._gpu_api_provider, CudaAPI - ): - raise ValueError( - "Cannot select CUDA GPU API since another API was already chosen" - ) - self._gpu_api_provider = CudaAPI() def use_hip(self): + """Instruct the GPU composer to use the HIP runtime API""" from ..lang.gpu import HipAPI - if self._gpu_api_provider is not None and not isinstance( - self._gpu_api_provider, HipAPI - ): - raise ValueError( - "Cannot select HIP GPU API since another API was already chosen" - ) - self._gpu_api_provider = HipAPI() @property def gpu_api(self) -> ProvidesGpuRuntimeAPI: + """GPU runtime API wrapper currently used by this GPU composer. + + Raises: + AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`) + """ if self._gpu_api_provider is None: raise AttributeError( "No GPU API was selected - call `use_cuda()` or `use_hip()` first." @@ -104,7 +97,8 @@ class SfgGpuComposer(SfgComposerMixIn): This signature accepts kernels generated with an indexing scheme that permits a user-defined blocks size, such as `Linear3D <IndexingScheme.Linear3D>`. - The grid size is calculated automatically. 
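+        For ``n`` work items and block size ``b`` in a given dimension, the
+        emitted grid size expression is the integer ceiling division
+        ``(n + b - 1) / b``.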
+ The grid size is calculated automatically by dividing the number of work items in each + dimension by the block size, rounding up. """ def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: @@ -144,6 +138,9 @@ class SfgGpuComposer(SfgComposerMixIn): stream=stmt_stream, ) + def to_uint32_t(expr: AugExpr) -> AugExpr: + return AugExpr("uint32_t").format("uint32_t({})", expr) + match launch_config: case ManualLaunchConfiguration(): grid_size = kwargs["grid_size"] @@ -153,12 +150,14 @@ class SfgGpuComposer(SfgComposerMixIn): case AutomaticLaunchConfiguration(): grid_size_entries = [ - self.expr_from_lambda(gs) for gs in launch_config._grid_size + to_uint32_t(self.expr_from_lambda(gs)) + for gs in launch_config._grid_size ] grid_size_var = dim3(const=True).var("__grid_size") block_size_entries = [ - self.expr_from_lambda(bs) for bs in launch_config._block_size + to_uint32_t(self.expr_from_lambda(bs)) + for bs in launch_config._block_size ] block_size_var = dim3(const=True).var("__block_size") @@ -197,27 +196,16 @@ class SfgGpuComposer(SfgComposerMixIn): "uint32_t", "uint32_t", "uint32_t", const=True ).var("__work_items") - def _min(a: ExprLike, b: ExprLike): - return AugExpr.format("{a} < {b} ? {a} : {b}", a=a, b=b) - def _div_ceil(a: ExprLike, b: ExprLike): return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - reduced_block_size_entries = [ - _min(work_items_var.get(i), bs) - for i, bs in enumerate( - [block_size_var.x, block_size_var.y, block_size_var.z] - ) - ] - reduced_block_size_var = dim3(const=True).var("__reduced_block_size") - grid_size_entries = [ _div_ceil(work_items_var.get(i), bs) for i, bs in enumerate( [ - reduced_block_size_var.x, - reduced_block_size_var.y, - reduced_block_size_var.z, + block_size_var.x, + block_size_var.y, + block_size_var.z, ] ) ] @@ -226,9 +214,8 @@ class SfgGpuComposer(SfgComposerMixIn): nodes = [ self.init(block_size_var)(*block_size_init_args), self.init(work_items_var)(*work_items_entries), - self.init(reduced_block_size_var)(*reduced_block_size_entries), self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, reduced_block_size_var), + _render_invocation(grid_size_var, block_size_var), ] return SfgBlock(SfgSequence(nodes)) diff --git a/src/pystencilssfg/context.py b/src/pystencilssfg/context.py index 1622a1e..3ea82f2 100644 --- a/src/pystencilssfg/context.py +++ b/src/pystencilssfg/context.py @@ -150,6 +150,9 @@ class SfgCursor: self._loc[f].append(block) self._loc[f] = block.elements + outer_namespace = self._cur_namespace + self._cur_namespace = namespace + @contextmanager def ctxmgr(): try: @@ -157,5 +160,6 @@ class SfgCursor: finally: # Have the cursor step back out of the nested namespace blocks self._loc = outer_locs + self._cur_namespace = outer_namespace return ctxmgr() diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp index 495f100..b6d2d2d 100644 --- a/tests/generator_scripts/source/HipKernels.harness.cpp +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -5,6 +5,7 @@ #include <experimental/mdspan> #include <random> #include <iostream> +#include <functional> #undef NDEBUG #include <cassert> @@ -41,29 +42,63 @@ int main(void) std::mt19937 gen{rd()}; std::uniform_real_distribution<double> distrib{-1.0, 1.0}; - for (size_t i = 0; i < items; ++i) - { - data_src[i] = distrib(gen); - } - - dim3 blockSize{64, 8, 1}; - - hipStream_t stream; - checkHipError(hipStreamCreate(&stream)); - - 
gen::gpuScaleKernel(blockSize, dst, src, stream); + auto check = [&](std::function< void () > invoke) { + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + data_dst[i] = NAN; + } - checkHipError(hipStreamSynchronize(stream)); + invoke(); - for (size_t i = 0; i < items; ++i) - { - const double desired = 2.0 * data_src[i]; - if (std::abs(desired - data_dst[i]) >= 1e-12) + for (size_t i = 0; i < items; ++i) { - std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; - exit(EXIT_FAILURE); + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } } - } + }; + + check([&]() { + /* Linear3D Dynamic */ + dim3 blockSize{64, 8, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d::scaleKernel(blockSize, dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Automatic */ + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d::scaleKernel(dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Linear3D Manual */ + dim3 blockSize{32, 8, 1}; + dim3 gridSize{5, 4, 23}; + + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Manual */ + dim3 blockSize{132, 1, 1}; + dim3 gridSize{25, 23, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); + }); checkHipError(hipFree(data_src)); checkHipError(hipFree(data_dst)); diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index ed22933..35315b8 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -5,21 +5,79 @@ import pystencils as ps std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +src, dst = ps.fields("src, dst: double[3D]", layout="c") +asm = ps.Assignment(dst(0), 2 * src(0)) + + with SourceFileGenerator() as sfg: sfg.use_hip() sfg.namespace("gen") - src, dst = ps.fields("src, dst: double[3D]", layout="c") - asm = ps.Assignment(dst(0), 2 * src(0)) - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - - khandle = sfg.kernels.create(asm, "scale", cfg) + base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) block_size = sfg.gpu_api.dim3().var("blockSize") + grid_size = sfg.gpu_api.dim3().var("gridSize") stream = sfg.gpu_api.stream_t().var("stream") - sfg.function("gpuScaleKernel")( - sfg.map_field(src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")), - sfg.map_field(dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")), - sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), - ) + with sfg.namespace("linear3d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + 
sfg.gpu_invoke(khandle, block_size=block_size, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream),
+        )
-- 
GitLab


From 4bc5ff44dd2076e36411ee037bf1fc48a1efaf57 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 10 Mar 2025 16:43:00 +0100
Subject: [PATCH 08/28] add cuda tests; add CUDA requirement to CI

---
 .gitlab-ci.yml                                |   5 +-
 .../source/CudaKernels.harness.cpp            | 107 ++++++++++++++++++
 tests/generator_scripts/source/CudaKernels.py |  80 +++++++++++--
 3 files changed, 181 insertions(+), 11 deletions(-)
 create mode 100644 tests/generator_scripts/source/CudaKernels.harness.cpp

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1420bd2..9a6e7b5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -38,10 +38,13 @@ typechecker:
        coverage_format: cobertura
        path: coverage.xml

-"testsuite-py3.10":
+"testsuite-py3.10+cuda":
   extends: .testsuite-base
   script:
     - nox --session testsuite-3.10
+  tags:
+    - docker
+    - cuda11

 "testsuite-py3.13":
   extends: .testsuite-base

diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp
new file mode 100644
index 0000000..b86a7c2
--- /dev/null
+++ b/tests/generator_scripts/source/CudaKernels.harness.cpp
@@ -0,0 +1,107 @@
+#include "CudaKernels.hpp"
+
+#include <cuda/cuda_runtime.h>
+
+#include <experimental/mdspan>
+#include <random>
+#include <iostream>
+#include <functional>
+
+#undef NDEBUG
+#include <cassert>
+
+namespace stdex = std::experimental;
+
+using extents_t = stdex::dextents<uint64_t, 3>;
+using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>;
+
+void checkCudaError(cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA Error: " << err << std::endl;
+        exit(2);
+    }
+}
+
+int main(void)
+{
+
+    extents_t extents{23, 25, 132};
+    size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)};
+
+    double *data_src;
+    checkCudaError(cudaMallocManaged<double>(&data_src, sizeof(double) * items));
+    field_t src{data_src, extents};
+
+    double *data_dst;
+    checkCudaError(cudaMallocManaged<double>(&data_dst, sizeof(double) * items));
+    field_t
dst{data_dst, extents}; + + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<double> distrib{-1.0, 1.0}; + + auto check = [&](std::function< void () > invoke) { + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + data_dst[i] = NAN; + } + + invoke(); + + for (size_t i = 0; i < items; ++i) + { + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } + } + }; + + check([&]() { + /* Linear3D Dynamic */ + dim3 blockSize{64, 8, 1}; + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d::scaleKernel(blockSize, dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Automatic */ + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::blockwise4d::scaleKernel(dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); + }); + + check([&]() { + /* Linear3D Manual */ + dim3 blockSize{32, 8, 1}; + dim3 gridSize{5, 4, 23}; + + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); + }); + + check([&]() { + /* Blockwise4D Manual */ + dim3 blockSize{132, 1, 1}; + dim3 gridSize{25, 23, 1}; + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); + }); + + checkCudaError(cudaFree(data_src)); + checkCudaError(cudaFree(data_dst)); + + return EXIT_SUCCESS; +} diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py index 21064f6..dc7e643 100644 --- a/tests/generator_scripts/source/CudaKernels.py +++ b/tests/generator_scripts/source/CudaKernels.py @@ -5,19 +5,79 @@ import pystencils as ps std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + +src, dst = ps.fields("src, dst: double[3D]", layout="c") +asm = ps.Assignment(dst(0), 2 * src(0)) + + with SourceFileGenerator() as sfg: sfg.use_cuda() + sfg.namespace("gen") - src, dst = ps.fields("src, dst: double[3D]", layout="c") - asm = ps.Assignment(dst(0), 2 * src(0)) - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - - khandle = sfg.kernels.create(asm, "scale", cfg) + base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) block_size = sfg.gpu_api.dim3().var("blockSize") + grid_size = sfg.gpu_api.dim3().var("gridSize") + stream = sfg.gpu_api.stream_t().var("stream") + + with sfg.namespace("linear3d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), + ) + + with sfg.namespace("blockwise4d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, 
layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("linear3d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + ) + + with sfg.namespace("blockwise4d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) - sfg.function("invoke")( - sfg.map_field(src, std.mdspan.from_field(src)), - sfg.map_field(dst, std.mdspan.from_field(dst)), - sfg.gpu_invoke(khandle, block_size=block_size), - ) + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + ) -- GitLab From 39e83b12dc93864d3c290d79e16e076589e0b80a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:46:23 +0100 Subject: [PATCH 09/28] fix GPU CI task to use cuda image --- .gitlab-ci.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a6e7b5..abadb5f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,27 +4,26 @@ stages: - "Documentation" - deploy -.nox-base: +.qa-base: image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine tags: - docker linter: - extends: .nox-base + extends: .qa-base stage: "Code Quality" needs: [] script: - nox --session lint typechecker: - extends: .nox-base + extends: .qa-base stage: "Code Quality" needs: [] script: - nox --session typecheck .testsuite-base: - extends: .nox-base stage: "Tests" needs: [] coverage: '/TOTAL.*\s+(\d+%)$/' @@ -40,6 +39,7 @@ typechecker: "testsuite-py3.10+cuda": extends: .testsuite-base + image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:ubuntu24.04-cuda12.6 script: - nox --session testsuite-3.10 tags: @@ -48,6 +48,7 @@ typechecker: "testsuite-py3.13": extends: .testsuite-base + image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine script: - nox --session testsuite-3.13 -- GitLab From d81a23d956b65b8ccdb4c9501ce5d585d3c387f6 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:46:39 +0100 Subject: [PATCH 10/28] Update .gitlab-ci.yml file --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index abadb5f..ffee882 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,20 +4,20 @@ stages: - "Documentation" - deploy -.qa-base: +.nox-base: image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine tags: - docker linter: - extends: .qa-base + extends: .nox-base stage: "Code Quality" needs: [] script: - nox --session lint typechecker: - extends: .qa-base + extends: .nox-base stage: "Code Quality" needs: [] script: -- GitLab From 455b455efc29d70bc81175d0e0c605d36d44ad0a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:51:15 +0100 Subject: [PATCH 11/28] fix header suffix --- 
tests/generator_scripts/index.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index 788c8bd..c87977f 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -94,7 +94,7 @@ MdSpanLbStreaming: CudaKernels: sfg-args: - file-extensions: ["cuh", "cu"] + file-extensions: ["hpp", "cu"] compile: cxx: nvcc cxx-flags: -- GitLab From 8fea44168743b849446259565436e4c608d037bc Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:54:41 +0100 Subject: [PATCH 12/28] test automatic block size in linear3d --- .../source/CudaKernels.harness.cpp | 35 ++++++++++++------- tests/generator_scripts/source/CudaKernels.py | 23 ++++++++++-- .../source/HipKernels.harness.cpp | 35 ++++++++++++------- tests/generator_scripts/source/HipKernels.py | 23 ++++++++++-- 4 files changed, 86 insertions(+), 30 deletions(-) diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp index b86a7c2..e691129 100644 --- a/tests/generator_scripts/source/CudaKernels.harness.cpp +++ b/tests/generator_scripts/source/CudaKernels.harness.cpp @@ -42,7 +42,8 @@ int main(void) std::mt19937 gen{rd()}; std::uniform_real_distribution<double> distrib{-1.0, 1.0}; - auto check = [&](std::function< void () > invoke) { + auto check = [&](std::function<void()> invoke) + { for (size_t i = 0; i < items; ++i) { data_src[i] = distrib(gen); @@ -62,24 +63,33 @@ int main(void) } }; - check([&]() { + check([&]() + { /* Linear3D Dynamic */ dim3 blockSize{64, 8, 1}; cudaStream_t stream; checkCudaError(cudaStreamCreate(&stream)); gen::linear3d::scaleKernel(blockSize, dst, src, stream); - checkCudaError(cudaStreamSynchronize(stream)); - }); + checkCudaError(cudaStreamSynchronize(stream)); }); + + check([&]() + { + /* Linear3D Automatic */ + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d::scaleKernel(dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Blockwise4D Automatic */ cudaStream_t stream; checkCudaError(cudaStreamCreate(&stream)); gen::blockwise4d::scaleKernel(dst, src, stream); - checkCudaError(cudaStreamSynchronize(stream)); - }); + checkCudaError(cudaStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Linear3D Manual */ dim3 blockSize{32, 8, 1}; dim3 gridSize{5, 4, 23}; @@ -87,18 +97,17 @@ int main(void) cudaStream_t stream; checkCudaError(cudaStreamCreate(&stream)); gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); - checkCudaError(cudaStreamSynchronize(stream)); - }); + checkCudaError(cudaStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Blockwise4D Manual */ dim3 blockSize{132, 1, 1}; dim3 gridSize{25, 23, 1}; cudaStream_t stream; checkCudaError(cudaStreamCreate(&stream)); gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); - checkCudaError(cudaStreamSynchronize(stream)); - }); + checkCudaError(cudaStreamSynchronize(stream)); }); checkCudaError(cudaFree(data_src)); checkCudaError(cudaFree(data_dst)); diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py index dc7e643..8572782 100644 --- a/tests/generator_scripts/source/CudaKernels.py +++ b/tests/generator_scripts/source/CudaKernels.py @@ -35,6 +35,21 @@ with SourceFileGenerator() as sfg: sfg.gpu_invoke(khandle, block_size=block_size, 
stream=stream), ) + with sfg.namespace("linear3d_automatic"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + with sfg.namespace("blockwise4d"): cfg = base_config.copy() cfg.gpu.indexing_scheme = "blockwise4d" @@ -63,7 +78,9 @@ with SourceFileGenerator() as sfg: sfg.map_field( dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") ), - sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), ) with sfg.namespace("blockwise4d_manual"): @@ -79,5 +96,7 @@ with SourceFileGenerator() as sfg: sfg.map_field( dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") ), - sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), ) diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp index b6d2d2d..2bf7b83 100644 --- a/tests/generator_scripts/source/HipKernels.harness.cpp +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -42,7 +42,8 @@ int main(void) std::mt19937 gen{rd()}; std::uniform_real_distribution<double> distrib{-1.0, 1.0}; - auto check = [&](std::function< void () > invoke) { + auto check = [&](std::function<void()> invoke) + { for (size_t i = 0; i < items; ++i) { data_src[i] = distrib(gen); @@ -62,24 +63,33 @@ int main(void) } }; - check([&]() { + check([&]() + { /* Linear3D Dynamic */ dim3 blockSize{64, 8, 1}; hipStream_t stream; checkHipError(hipStreamCreate(&stream)); gen::linear3d::scaleKernel(blockSize, dst, src, stream); - checkHipError(hipStreamSynchronize(stream)); - }); + checkHipError(hipStreamSynchronize(stream)); }); + + check([&]() + { + /* Linear3D Automatic */ + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d_automatic::scaleKernel(dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Blockwise4D Automatic */ hipStream_t stream; checkHipError(hipStreamCreate(&stream)); gen::blockwise4d::scaleKernel(dst, src, stream); - checkHipError(hipStreamSynchronize(stream)); - }); + checkHipError(hipStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Linear3D Manual */ dim3 blockSize{32, 8, 1}; dim3 gridSize{5, 4, 23}; @@ -87,18 +97,17 @@ int main(void) hipStream_t stream; checkHipError(hipStreamCreate(&stream)); gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); - checkHipError(hipStreamSynchronize(stream)); - }); + checkHipError(hipStreamSynchronize(stream)); }); - check([&]() { + check([&]() + { /* Blockwise4D Manual */ dim3 blockSize{132, 1, 1}; dim3 gridSize{25, 23, 1}; hipStream_t stream; checkHipError(hipStreamCreate(&stream)); gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); - checkHipError(hipStreamSynchronize(stream)); - }); + checkHipError(hipStreamSynchronize(stream)); }); checkHipError(hipFree(data_src)); checkHipError(hipFree(data_dst)); diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py 
index 35315b8..78464b5 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -35,6 +35,21 @@ with SourceFileGenerator() as sfg: sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), ) + with sfg.namespace("linear3d_automatic"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + with sfg.namespace("blockwise4d"): cfg = base_config.copy() cfg.gpu.indexing_scheme = "blockwise4d" @@ -63,7 +78,9 @@ with SourceFileGenerator() as sfg: sfg.map_field( dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") ), - sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), ) with sfg.namespace("blockwise4d_manual"): @@ -79,5 +96,7 @@ with SourceFileGenerator() as sfg: sfg.map_field( dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") ), - sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size, stream=stream), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), ) -- GitLab From e9fdbbe1d45f84636d4d3bd8a9927107b40f8085 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 10 Mar 2025 16:56:06 +0100 Subject: [PATCH 13/28] fix cuda runtime header path --- tests/generator_scripts/source/CudaKernels.harness.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp index e691129..76694bd 100644 --- a/tests/generator_scripts/source/CudaKernels.harness.cpp +++ b/tests/generator_scripts/source/CudaKernels.harness.cpp @@ -1,6 +1,6 @@ #include "CudaKernels.hpp" -#include <cuda/cuda_runtime.h> +#include <cuda_runtime.h> #include <experimental/mdspan> #include <random> -- GitLab From 3ff729e9f98b8d9a2323f9ee46f3d68d9eea4491 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 11 Mar 2025 08:59:13 +0100 Subject: [PATCH 14/28] fix CUDA kernels test --- tests/generator_scripts/source/CudaKernels.harness.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp index 76694bd..a7d34d5 100644 --- a/tests/generator_scripts/source/CudaKernels.harness.cpp +++ b/tests/generator_scripts/source/CudaKernels.harness.cpp @@ -77,7 +77,7 @@ int main(void) /* Linear3D Automatic */ cudaStream_t stream; checkCudaError(cudaStreamCreate(&stream)); - gen::linear3d::scaleKernel(dst, src, stream); + gen::linear3d_automatic::scaleKernel(dst, src, stream); checkCudaError(cudaStreamSynchronize(stream)); }); check([&]() -- GitLab From 21c0ba7fd47f41c7b38852e9ba59e6f255858d91 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 11 Mar 2025 09:54:54 +0100 Subject: [PATCH 15/28] documentation on GPU invocation --- docs/source/_util/sfg_monkeypatch.py | 17 ++--- docs/source/api/composer.rst | 3 + docs/source/usage/how_to_composer.md | 38 +++++++++++ src/pystencilssfg/composer/__init__.py | 2 + 
src/pystencilssfg/composer/basic_composer.py | 4 +- src/pystencilssfg/composer/gpu_composer.py | 67 ++++++++++++++------ 6 files changed, 101 insertions(+), 30 deletions(-) diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py index 0269d40..1277603 100644 --- a/docs/source/_util/sfg_monkeypatch.py +++ b/docs/source/_util/sfg_monkeypatch.py @@ -1,6 +1,8 @@ import pystencilssfg from pystencilssfg.config import SfgConfig +from os.path import splitext + class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): """Mockup wrapper around SourceFileGenerator for use in documentation @@ -30,21 +32,20 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): self._finish_files() header_code = self._emitter.dumps(self._header_file) - impl_code = ( - None - if self._impl_file is None - else self._emitter.dumps(self._impl_file) - ) + header_ext = splitext(self._header_file.name)[1] mdcode = ":::::{tab-set}\n" - mdcode += "::::{tab-item} Generated Header (.hpp)\n" + mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n" mdcode += ":::{code-block} C++\n\n" mdcode += header_code mdcode += "\n:::\n::::\n" - if impl_code: - mdcode += "::::{tab-item} Generated Implementation (.cpp)\n" + if self._impl_file is not None: + impl_code = self._emitter.dumps(self._impl_file) + impl_ext = splitext(self._impl_file.name)[1] + + mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n" mdcode += ":::{code-block} C++\n\n" mdcode += impl_code mdcode += "\n:::\n::::\n" diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst index 124d0fb..078e0eb 100644 --- a/docs/source/api/composer.rst +++ b/docs/source/api/composer.rst @@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``) .. autoclass:: SfgClassComposer :members: +.. autoclass:: SfgGpuComposer + :members: + Custom Generators ================= diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md index 966a9a6..4610d07 100644 --- a/docs/source/usage/how_to_composer.md +++ b/docs/source/usage/how_to_composer.md @@ -344,6 +344,44 @@ cause them to be added to its signature. We don't want to expose this complexity, but instead hide it by using appropriate data structures. The next section explains how that is achieved in pystencils-sfg. +#### Invoking GPU Kernels + +Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms. +First, we need to decide for one of the two systems by calling either +{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`. +After registering a GPU kernel, +you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`. 
+ +Here is a basic example: + +```{code-cell} ipython3 +:tags: [remove-cell] + +f, g = ps.fields("f, g: double[2D]") +asm = ps.Assignment(f(0), g(0)) +``` + +```{code-cell} ipython3 +from pystencilssfg import SfgConfig +sfg_config = SfgConfig() +sfg_config.extensions.impl = "cu" + +with SourceFileGenerator(sfg_config) as sfg: + # Activate CUDA + sfg.use_cuda() + + # Register the GPU kernel + cfg = ps.CreateKernelConfig() + cfg.target = ps.Target.CUDA + khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + # Invoke it + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle) + ) + +``` + #### Mapping Fields to Data Structures Pystencils kernels operate on n-dimensional contiguous or strided arrays, diff --git a/src/pystencilssfg/composer/__init__.py b/src/pystencilssfg/composer/__init__.py index f6af76b..c8f279e 100644 --- a/src/pystencilssfg/composer/__init__.py +++ b/src/pystencilssfg/composer/__init__.py @@ -9,6 +9,7 @@ from .basic_composer import ( ) from .mixin import SfgComposerMixIn from .class_composer import SfgClassComposer +from .gpu_composer import SfgGpuComposer __all__ = [ "SfgIComposer", @@ -20,4 +21,5 @@ __all__ = [ "ExprLike", "SfgBasicComposer", "SfgClassComposer", + "SfgGpuComposer", ] diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 97334db..d78e43d 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -390,8 +390,8 @@ class SfgBasicComposer(SfgIComposer): """Use inside a function body to directly call a kernel. When using `call`, the given kernel will simply be called as a function. - To invoke a GPU kernel on a specified launch grid, use `cuda_invoke` - or the interfaces of ``pystencilssfg.extensions.sycl`` instead. + To invoke a GPU kernel on a specified launch grid, + use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead. Args: kernel_handle: Handle to a kernel previously added to some kernel namespace. diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index b24afcd..274c81c 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -24,6 +24,50 @@ from ..lang.gpu import ProvidesGpuRuntimeAPI class SfgGpuComposer(SfgComposerMixIn): + """Composer mix-in providing methods to generate GPU kernel invocations. + + .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs) + + Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration. + + The overloads of this method are listed below. + They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax; + for details on the launch configuration arguments, + refer to `Launch Configurations in CUDA`_ + or `Launch Configurations in HIP`_. + + .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode + :noindex: + + Invoke a GPU kernel with a manual launch grid. + + Requires that the kernel was generated + with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>` + set to `True`. + + .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode + :noindex: + + Invoke a GPU kernel with an automatic launch grid. 
+
+        This signature accepts kernels generated with an indexing scheme that
+        causes the launch grid to be determined automatically,
+        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        block size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
+        The grid size is calculated automatically by dividing the number of work items in each
+        dimension by the block size, rounding up.
+
+    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+
+    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    """  # NOQA: E501

     def __init__(self) -> None:
         self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
@@ -63,12 +107,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         block_size: ExprLike,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a manual launch grid.
-
-        Requires that the kernel was generated with `manual_launch_grid <GpuOptions.manual_launch_grid>`
-        set to `True`.
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -77,12 +116,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with an automatic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits
-        the automatic inferrence of the launch grid, such as `Blockwise4D <IndexingScheme.Blockwise4D>`
-        """
+    ) -> SfgCallTreeNode: ...

     @overload
     def gpu_invoke(
@@ -92,14 +126,7 @@ class SfgGpuComposer(SfgComposerMixIn):
         *,
         block_size: ExprLike | None = None,
         shared_memory_bytes: ExprLike = "0",
         stream: ExprLike | None = None,
-    ) -> SfgCallTreeNode:
-        """Invoke a CUDA kernel with a dynamic launch grid.
-
-        This signature accepts kernels generated with an indexing scheme that permits a user-defined
-        blocks size, such as `Linear3D <IndexingScheme.Linear3D>`.
-        The grid size is calculated automatically by dividing the number of work items in each
-        dimension by the block size, rounding up.
-        """
+    ) -> SfgCallTreeNode: ...

     def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
         assert isinstance(
-- 
GitLab


From f2ad7231a50c375fbc3bbee9f27fbff7cde704c5 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Tue, 11 Mar 2025 11:35:06 +0100
Subject: [PATCH 16/28] more explanation in composer guide

---
 docs/source/usage/how_to_composer.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 4610d07..d21d46f 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -379,9 +379,16 @@ with SourceFileGenerator(sfg_config) as sfg:
     sfg.function("kernel_wrapper")(
         sfg.gpu_invoke(khandle)
     )
-
 ```

+In this snippet, we used the [generator configuration](#how_to_generator_scripts_config)
+to change the suffix of the generated implementation file to `.cu`.
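+The same setting need not be hard-coded in the generator script: as described in
+the [generator configuration guide](#how_to_generator_scripts_config), it can
+equally be supplied from the command line or from a project-wide configuration module.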
+ +When investigating the generated `.cu` file, you can see that the GPU launch configuration parameters +*grid size* and *block size* are being computed automatically from the array sizes. +This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>` +category of the `CreateKernelConfig`. + #### Mapping Fields to Data Structures Pystencils kernels operate on n-dimensional contiguous or strided arrays, -- GitLab From 377bab57e08ebb0fbc75a8971ff0a49d8f2f1076 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 11 Mar 2025 11:48:31 +0100 Subject: [PATCH 17/28] remove extensions.gpu --- src/pystencilssfg/extensions/gpu.py | 38 ----------------------------- tests/extensions/test_gpu.py | 26 -------------------- 2 files changed, 64 deletions(-) delete mode 100644 src/pystencilssfg/extensions/gpu.py delete mode 100644 tests/extensions/test_gpu.py diff --git a/src/pystencilssfg/extensions/gpu.py b/src/pystencilssfg/extensions/gpu.py deleted file mode 100644 index b4242ac..0000000 --- a/src/pystencilssfg/extensions/gpu.py +++ /dev/null @@ -1,38 +0,0 @@ -from pystencilssfg import lang - - -def dim3class(gpu_runtime_header: str, *, cls_name: str = "dim3"): - """ - >>> dim3 = dim3class("<hip/hip_runtime.h>") - >>> dim3().ctor(64, 1, 1) - dim3{64, 1, 1} - - Args: - gpu_runtime_header: String with the name of the gpu runtime header - cls_name: String with the acutal name (default "dim3") - Returns: - Dim3Class: A `lang.CppClass` that mimics cuda's/hip's `dim3` - """ - @lang.cppclass(cls_name, gpu_runtime_header) - class Dim3Class: - def ctor(self, dim0=1, dim1=1, dim2=1): - return self.ctor_bind(dim0, dim1, dim2) - - @property - def x(self): - return lang.AugExpr.format("{}.x", self) - - @property - def y(self): - return lang.AugExpr.format("{}.y", self) - - @property - def z(self): - return lang.AugExpr.format("{}.z", self) - - @property - def dims(self): - """The dims property.""" - return [self.x, self.y, self.z] - - return Dim3Class diff --git a/tests/extensions/test_gpu.py b/tests/extensions/test_gpu.py deleted file mode 100644 index 2e8d133..0000000 --- a/tests/extensions/test_gpu.py +++ /dev/null @@ -1,26 +0,0 @@ -from pystencilssfg.extensions.gpu import dim3class -from pystencilssfg.lang import HeaderFile, AugExpr - - -def test_dim3(): - cuda_runtime = "<cuda_runtime.h>" - dim3 = dim3class(cuda_runtime, cls_name="dim3") - assert HeaderFile.parse(cuda_runtime) in dim3.template.includes - assert str(dim3().ctor(128, 1, 1)) == "dim3{128, 1, 1}" - assert str(dim3().ctor()) == "dim3{1, 1, 1}" - assert str(dim3().ctor(1, 1, 128)) == "dim3{1, 1, 128}" - - block = dim3(ref=True, const=True).var("block") - - dims = [ - AugExpr.format( - "uint32_t(({} + {} - 1)/ {})", - 1024, - block.dims[i], - block.dims[i], - ) - for i in range(3) - ] - - grid = dim3().ctor(*dims) - assert str(grid) == f"dim3{{{', '.join((str(d) for d in dims))}}}" -- GitLab From e4c257df0c0087103d5d638baaa75519d16462ce Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 11 Mar 2025 13:51:30 +0100 Subject: [PATCH 18/28] add lang.gpu to api docs --- docs/source/api/lang.rst | 6 ++++++ src/pystencilssfg/lang/gpu.py | 35 +++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/docs/source/api/lang.rst b/docs/source/api/lang.rst index bd5e4fa..cdf8f24 100644 --- a/docs/source/api/lang.rst +++ b/docs/source/api/lang.rst @@ -41,3 +41,9 @@ Implementation .. 
automodule:: pystencilssfg.lang.cpp :members: + +GPU Runtime APIs +---------------- + +.. automodule:: pystencilssfg.lang.gpu + :members: diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py index ccf86d9..0ca2a6d 100644 --- a/src/pystencilssfg/lang/gpu.py +++ b/src/pystencilssfg/lang/gpu.py @@ -5,38 +5,50 @@ from typing import Protocol from .expressions import CppClass, cpptype, AugExpr -class _Dim3Base(CppClass): +class Dim3Interface(CppClass): + """Interface definition for the ``dim3`` struct of Cuda and HIP.""" + def ctor(self, dim0=1, dim1=1, dim2=1): + """Constructor invocation of ``dim3``""" return self.ctor_bind(dim0, dim1, dim2) @property - def x(self): + def x(self) -> AugExpr: + """The `x` coordinate member.""" return AugExpr.format("{}.x", self) @property - def y(self): + def y(self) -> AugExpr: + """The `y` coordinate member.""" return AugExpr.format("{}.y", self) @property - def z(self): + def z(self) -> AugExpr: + """The `z` coordinate member.""" return AugExpr.format("{}.z", self) @property - def dims(self): - """The dims property.""" - return [self.x, self.y, self.z] + def dims(self) -> tuple[AugExpr, AugExpr, AugExpr]: + """`x`, `y`, and `z` as a tuple.""" + return (self.x, self.y, self.z) class ProvidesGpuRuntimeAPI(Protocol): + """Protocol definition for a GPU runtime API provider.""" - dim3: type[_Dim3Base] + dim3: type[Dim3Interface] + """The ``dim3`` struct type for this GPU runtime""" stream_t: type[AugExpr] + """The ``stream_t`` type for this GPU runtime""" class CudaAPI(ProvidesGpuRuntimeAPI): + """Reflection of the CUDA runtime API""" + + class dim3(Dim3Interface): + """Implements `Dim3Interface` for CUDA""" - class dim3(_Dim3Base): template = cpptype("dim3", "<cuda_runtime.h>") class stream_t(CppClass): @@ -44,8 +56,11 @@ class CudaAPI(ProvidesGpuRuntimeAPI): class HipAPI(ProvidesGpuRuntimeAPI): + """Reflection of the HIP runtime API""" + + class dim3(Dim3Interface): + """Implements `Dim3Interface` for HIP""" - class dim3(_Dim3Base): template = cpptype("dim3", "<hip/hip_runtime.h>") class stream_t(CppClass): -- GitLab From 6a7559133cf32eb8f257d8cc376054cce7d1995f Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 09:42:28 +0100 Subject: [PATCH 19/28] separate failing and non-failing getters for gpu api --- src/pystencilssfg/composer/gpu_composer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 274c81c..4c3a8a4 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -85,8 +85,13 @@ class SfgGpuComposer(SfgComposerMixIn): self._gpu_api_provider = HipAPI() @property - def gpu_api(self) -> ProvidesGpuRuntimeAPI: - """GPU runtime API wrapper currently used by this GPU composer. + def gpu_api(self) -> ProvidesGpuRuntimeAPI | None: + """GPU runtime API wrapper currently used by this GPU composer, + or `None` if none was selected.""" + return self._gpu_api_provider + + def get_gpu_api(self) -> ProvidesGpuRuntimeAPI: + """GPU runtime API provider currently used by this GPU composer. 
Raises: AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`) @@ -140,7 +145,7 @@ class SfgGpuComposer(SfgComposerMixIn): launch_config = ker.get_launch_configuration() - dim3 = self.gpu_api.dim3 + dim3 = self.get_gpu_api().dim3 grid_size: ExprLike block_size: ExprLike -- GitLab From d35acf15374b7424978bc63a5d3f1bead1dcae2a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 10:26:48 +0100 Subject: [PATCH 20/28] use Target.HIP --- src/pystencilssfg/ir/call_tree.py | 5 ++--- tests/generator_scripts/source/HipKernels.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index ab84db4..61c5bbb 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -226,13 +226,12 @@ class SfgGpuKernelInvocation(SfgCallTreeNode): shared_memory_bytes: SfgStatements | None, stream: SfgStatements | None, ): - from pystencils import Target from pystencils.codegen import GpuKernel kernel = kernel_handle.kernel - if not (isinstance(kernel, GpuKernel) and kernel.target == Target.CUDA): + if not isinstance(kernel, GpuKernel): raise ValueError( - "An `SfgCudaKernelInvocation` node can only call a CUDA kernel." + "An `SfgGpuKernelInvocation` node can only call GPU kernels." ) super().__init__() diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index 78464b5..32d9b1d 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -14,7 +14,7 @@ with SourceFileGenerator() as sfg: sfg.use_hip() sfg.namespace("gen") - base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) + base_config = ps.CreateKernelConfig(target=ps.Target.HIP) block_size = sfg.gpu_api.dim3().var("blockSize") grid_size = sfg.gpu_api.dim3().var("gridSize") -- GitLab From 069d274d504b54d9805e729716691fb33789fd46 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 10:57:02 +0100 Subject: [PATCH 21/28] permit modification of code style and clang-format options through the composer/context --- docs/source/_util/sfg_monkeypatch.py | 5 +++-- src/pystencilssfg/context.py | 10 +++++++++- src/pystencilssfg/generator.py | 21 ++++++++++++--------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py index 1277603..91a9623 100644 --- a/docs/source/_util/sfg_monkeypatch.py +++ b/docs/source/_util/sfg_monkeypatch.py @@ -30,8 +30,9 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): def __exit__(self, exc_type, exc_value, traceback): if exc_type is None: self._finish_files() + emitter = self._get_emitter() - header_code = self._emitter.dumps(self._header_file) + header_code = emitter.dumps(self._header_file) header_ext = splitext(self._header_file.name)[1] mdcode = ":::::{tab-set}\n" @@ -42,7 +43,7 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): mdcode += "\n:::\n::::\n" if self._impl_file is not None: - impl_code = self._emitter.dumps(self._impl_file) + impl_code = emitter.dumps(self._impl_file) impl_ext = splitext(self._impl_file.name)[1] mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n" diff --git a/src/pystencilssfg/context.py b/src/pystencilssfg/context.py index 3ea82f2..5773455 100644 --- a/src/pystencilssfg/context.py +++ b/src/pystencilssfg/context.py @@ -2,7 +2,7 @@ from __future__ import 
annotations from typing import Sequence, Any, Generator from contextlib import contextmanager -from .config import CodeStyle +from .config import CodeStyle, ClangFormatOptions from .ir import ( SfgSourceFile, SfgNamespace, @@ -23,6 +23,7 @@ class SfgContext: impl_file: SfgSourceFile | None, namespace: str | None = None, codestyle: CodeStyle | None = None, + clang_format_opts: ClangFormatOptions | None = None, argv: Sequence[str] | None = None, project_info: Any = None, ): @@ -33,6 +34,9 @@ class SfgContext: self._inner_namespace: str | None = None self._codestyle = codestyle if codestyle is not None else CodeStyle() + self._clang_format: ClangFormatOptions = ( + clang_format_opts if clang_format_opts is not None else ClangFormatOptions() + ) self._header_file = header_file self._impl_file = impl_file @@ -73,6 +77,10 @@ class SfgContext: """The code style object for this generation context.""" return self._codestyle + @property + def clang_format(self) -> ClangFormatOptions: + return self._clang_format + @property def header_file(self) -> SfgSourceFile: return self._header_file diff --git a/src/pystencilssfg/generator.py b/src/pystencilssfg/generator.py index c314d67..fe4eb99 100644 --- a/src/pystencilssfg/generator.py +++ b/src/pystencilssfg/generator.py @@ -95,9 +95,7 @@ class SourceFileGenerator: self._impl_file = SfgSourceFile( output_files[1].name, SfgSourceFileType.TRANSLATION_UNIT ) - self._impl_file.includes.append( - HeaderFile.parse(self._header_file.name) - ) + self._impl_file.includes.append(HeaderFile.parse(self._header_file.name)) # TODO: Find a way to not hard-code the restrict qualifier in pystencils self._header_file.elements.append("#define RESTRICT __restrict__") @@ -115,14 +113,11 @@ class SourceFileGenerator: self._impl_file, namespace, config.codestyle, + config.clang_format, argv=script_args, project_info=cli_params.get_project_info(), ) - self._emitter = SfgCodeEmitter( - self._output_dir, config.codestyle, config.clang_format - ) - sort_key = config.codestyle.get_option("includes_sorting_key") if sort_key is None: @@ -161,6 +156,13 @@ class SourceFileGenerator: ) self._impl_file.includes.sort(key=self._include_sort_key) + def _get_emitter(self): + return SfgCodeEmitter( + self._output_dir, + self._context.codestyle, + self._context.clang_format, + ) + def __enter__(self) -> SfgComposer: self.clean_files() return SfgComposer(self._context) @@ -169,6 +171,7 @@ class SourceFileGenerator: if exc_type is None: self._finish_files() - self._emitter.emit(self._header_file) + emitter = self._get_emitter() + emitter.emit(self._header_file) if self._impl_file is not None: - self._emitter.emit(self._impl_file) + emitter.emit(self._impl_file) -- GitLab From 28941caeea1eb275eff3fe5862e8e76d5bfb5833 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 12:04:41 +0100 Subject: [PATCH 22/28] disable formatting on GPU invocations to avoid clang-format breaking cuda/hip code --- src/pystencilssfg/composer/gpu_composer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 4c3a8a4..757ab66 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -162,12 +162,17 @@ class SfgGpuComposer(SfgComposerMixIn): ) stmt_stream = make_statements(stream) if stream is not None else None - return SfgGpuKernelInvocation( - kernel_handle, - stmt_grid_size, - stmt_block_size, - 
shared_memory_bytes=stmt_smem,
-            stream=stmt_stream,
+        return self.seq(
+            "// clang-format off: "
+            "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.",
+            SfgGpuKernelInvocation(
+                kernel_handle,
+                stmt_grid_size,
+                stmt_block_size,
+                shared_memory_bytes=stmt_smem,
+                stream=stmt_stream,
+            ),
+            "// clang-format on",
         )

     def to_uint32_t(expr: AugExpr) -> AugExpr:
-- 
GitLab


From c8c1a548ec3e25dbda9e11a1e9ffc09601f13dec Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 15 Mar 2025 11:59:15 +0100
Subject: [PATCH 23/28] Use newly introduced `HIP` target.

- remove `sfg.use_cuda` and `sfg.use_hip`; infer API from kernel target
  instead
- Adapt test cases
- Adapt composer user guide

---
 docs/source/usage/how_to_composer.md          | 35 ++++++-------
 src/pystencilssfg/composer/gpu_composer.py    | 49 +++++--------------
 src/pystencilssfg/lang/gpu.py                 |  8 +++
 tests/generator_scripts/source/CudaKernels.py |  8 +--
 tests/generator_scripts/source/HipKernels.py  |  8 +--
 5 files changed, 43 insertions(+), 65 deletions(-)

diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index d21d46f..12a8435 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -346,20 +346,13 @@ The next section explains how that is achieved in pystencils-sfg.

 #### Invoking GPU Kernels

-Pystencils also allows us to generate kernels for the CUDA and HIP GPU platforms.
-First, we need to decide for one of the two systems by calling either
-{any}`sfg.use_cuda <SfgGpuComposer.use_cuda>` or {any}`sfg.use_hip <SfgGpuComposer.use_hip>`.
+Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
+To generate a kernel targeting either of these, set the
+{any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
+code generator option to either `Target.CUDA` or `Target.HIP`.
 After registering a GPU kernel,
-you can render its invocation using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
-
-Here is a basic example:
-
-```{code-cell} ipython3
-:tags: [remove-cell]
-
-f, g = ps.fields("f, g: double[2D]")
-asm = ps.Assignment(f(0), g(0))
-```
+its invocation can be rendered using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
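+The keyword arguments accepted by `gpu_invoke` (`grid_size`, `block_size`,
+`shared_memory_bytes`, and `stream`) depend on the launch configuration the
+kernel was generated with; the available overloads are listed in the
+{any}`SfgGpuComposer.gpu_invoke` API documentation.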
+Here is an example using CUDA: ```{code-cell} ipython3 from pystencilssfg import SfgConfig @@ -367,12 +360,14 @@ sfg_config = SfgConfig() sfg_config.extensions.impl = "cu" with SourceFileGenerator(sfg_config) as sfg: - # Activate CUDA - sfg.use_cuda() + # Configure the code generator to use CUDA + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + # Create fields, assemble assignments + f, g = ps.fields("f, g: double[2D]") + asm = ps.Assignment(f(0), g(0)) - # Register the GPU kernel - cfg = ps.CreateKernelConfig() - cfg.target = ps.Target.CUDA + # Register kernel khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) # Invoke it @@ -442,8 +437,8 @@ with SourceFileGenerator() as sfg: :::{admonition} To Do - - Creating and calling kernels - - Invoking GPU kernels and the CUDA API Mirror + - Modifying GPU kernel launch configs + - GPU API Reflections - Defining classes, their fields constructors, and methods ::: diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 757ab66..72628ab 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import overload -from pystencils.codegen import GpuKernel +from pystencils.codegen import GpuKernel, Target from pystencils.codegen.gpu_indexing import ( ManualLaunchConfiguration, AutomaticLaunchConfiguration, @@ -20,7 +20,7 @@ from ..ir import ( SfgSequence, ) from ..lang import ExprLike, AugExpr -from ..lang.gpu import ProvidesGpuRuntimeAPI +from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI class SfgGpuComposer(SfgComposerMixIn): @@ -69,40 +69,6 @@ class SfgGpuComposer(SfgComposerMixIn): .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions """ # NOQA: E501 - def __init__(self) -> None: - self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None - - def use_cuda(self): - """Instruct the GPU composer to use the CUDA runtime API""" - from ..lang.gpu import CudaAPI - - self._gpu_api_provider = CudaAPI() - - def use_hip(self): - """Instruct the GPU composer to use the HIP runtime API""" - from ..lang.gpu import HipAPI - - self._gpu_api_provider = HipAPI() - - @property - def gpu_api(self) -> ProvidesGpuRuntimeAPI | None: - """GPU runtime API wrapper currently used by this GPU composer, - or `None` if none was selected.""" - return self._gpu_api_provider - - def get_gpu_api(self) -> ProvidesGpuRuntimeAPI: - """GPU runtime API provider currently used by this GPU composer. - - Raises: - AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`) - """ - if self._gpu_api_provider is None: - raise AttributeError( - "No GPU API was selected - call `use_cuda()` or `use_hip()` first." 
- ) - - return self._gpu_api_provider - @overload def gpu_invoke( self, @@ -145,7 +111,16 @@ class SfgGpuComposer(SfgComposerMixIn): launch_config = ker.get_launch_configuration() - dim3 = self.get_gpu_api().dim3 + gpu_api: type[ProvidesGpuRuntimeAPI] + match ker.target: + case Target.CUDA: + gpu_api = CudaAPI + case Target.HIP: + gpu_api = HipAPI + case _: + assert False, "unexpected GPU target" + + dim3 = gpu_api.dim3 grid_size: ExprLike block_size: ExprLike diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py index 0ca2a6d..c9736fb 100644 --- a/src/pystencilssfg/lang/gpu.py +++ b/src/pystencilssfg/lang/gpu.py @@ -55,6 +55,10 @@ class CudaAPI(ProvidesGpuRuntimeAPI): template = cpptype("cudaStream_t", "<cuda_runtime.h>") +cuda = CudaAPI +"""Reflection of the CUDA runtime API""" + + class HipAPI(ProvidesGpuRuntimeAPI): """Reflection of the HIP runtime API""" @@ -65,3 +69,7 @@ class HipAPI(ProvidesGpuRuntimeAPI): class stream_t(CppClass): template = cpptype("hipStream_t", "<hip/hip_runtime.h>") + + +hip = HipAPI +"""Reflection of the HIP runtime API""" diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py index 8572782..e019e4f 100644 --- a/tests/generator_scripts/source/CudaKernels.py +++ b/tests/generator_scripts/source/CudaKernels.py @@ -1,5 +1,6 @@ from pystencilssfg import SourceFileGenerator from pystencilssfg.lang.cpp import std +from pystencilssfg.lang.gpu import cuda import pystencils as ps @@ -11,14 +12,13 @@ asm = ps.Assignment(dst(0), 2 * src(0)) with SourceFileGenerator() as sfg: - sfg.use_cuda() sfg.namespace("gen") base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) - block_size = sfg.gpu_api.dim3().var("blockSize") - grid_size = sfg.gpu_api.dim3().var("gridSize") - stream = sfg.gpu_api.stream_t().var("stream") + block_size = cuda.dim3().var("blockSize") + grid_size = cuda.dim3().var("gridSize") + stream = cuda.stream_t().var("stream") with sfg.namespace("linear3d"): cfg = base_config.copy() diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py index 32d9b1d..20d9df5 100644 --- a/tests/generator_scripts/source/HipKernels.py +++ b/tests/generator_scripts/source/HipKernels.py @@ -1,5 +1,6 @@ from pystencilssfg import SourceFileGenerator from pystencilssfg.lang.cpp import std +from pystencilssfg.lang.gpu import hip import pystencils as ps @@ -11,14 +12,13 @@ asm = ps.Assignment(dst(0), 2 * src(0)) with SourceFileGenerator() as sfg: - sfg.use_hip() sfg.namespace("gen") base_config = ps.CreateKernelConfig(target=ps.Target.HIP) - block_size = sfg.gpu_api.dim3().var("blockSize") - grid_size = sfg.gpu_api.dim3().var("gridSize") - stream = sfg.gpu_api.stream_t().var("stream") + block_size = hip.dim3().var("blockSize") + grid_size = hip.dim3().var("gridSize") + stream = hip.stream_t().var("stream") with sfg.namespace("linear3d"): cfg = base_config.copy() -- GitLab From 51c03215aedefc3b36ebea07355fbe164e09709a Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Sat, 15 Mar 2025 13:17:36 +0100 Subject: [PATCH 24/28] extend GPU section of composer guide. Fix and clarify some aspects in the API docs and user guide. 
---
 docs/source/api/composer.rst         |   7 +
 docs/source/usage/config_and_cli.md  |  30 ++--
 docs/source/usage/how_to_composer.md | 209 +++++++++++++++++++++------
 src/pystencilssfg/lang/gpu.py        |   4 +-
 4 files changed, 196 insertions(+), 54 deletions(-)

diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst
index 078e0eb..8b470b0 100644
--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -40,6 +40,7 @@ Helper Methods and Builders

 .. autoclass:: SfgFunctionSequencer
    :members:
+   :inherited-members:

 .. autoclass:: SfgNodeBuilder
    :members:
@@ -50,6 +51,12 @@ Helper Methods and Builders
 .. autoclass:: SfgSwitchBuilder
    :members:

+.. module:: pystencilssfg.composer.class_composer
+
+.. autoclass:: SfgMethodSequencer
+   :members:
+   :inherited-members:
+

 Context and Cursor
 ==================

diff --git a/docs/source/usage/config_and_cli.md b/docs/source/usage/config_and_cli.md
index 785ff52..b6060c0 100644
--- a/docs/source/usage/config_and_cli.md
+++ b/docs/source/usage/config_and_cli.md
@@ -12,7 +12,7 @@ different configuration sources:
   the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options)
 - **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts
   may be configured globally within that project by the use of a *configuration module*.
-  Settings specified inside that configuration module are always overridden by the former to configuration sources.
+  Settings specified inside that configuration module are always overridden by the two other configuration sources listed above.
   For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration).

 (inline_config)=
@@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`.

 ### Code Style and Formatting

- - Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect
-   certain formatting aspects of the generated code.
- - To change, enforce, or disable auto-formatting of generated code through `clang-format`,
-   take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category.
- - Clang-format will, by default, sort `#include` statements alphabetically and separate
-   local and system header includes.
-   To override this, you can set a custom sorting key for `#include` sorting via
-   {any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
+Pystencils-sfg gives you some options to affect its output code style.
+These are controlled by the options in the {any}`cfg.code_style <CodeStyle>` category.
+
+Furthermore, pystencils-sfg uses `clang-format` to beautify generated code.
+The behaviour of the clang-format integration is managed by
+the {any}`cfg.clang_format <ClangFormatOptions>` category,
+where you can set options to skip or enforce formatting,
+or change the formatter binary.
+To set the code style used by `clang-format`, either create a `.clang-format` file
+in any of the parent folders of your generator script,
+or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option.
+
+:::{seealso}
+[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html)
+:::
+
+Clang-format will, by default, sort `#include` statements alphabetically and separate
+local and system header includes.
+To override this, you can set a custom sorting key for `#include` sorting via
+{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
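+
+As an illustration, a minimal sketch of such a key (this assumes the key is an
+ordinary sort-key callable applied to each included header, in the manner of
+Python's built-in `sorted`; consult the `includes_sorting_key` API reference
+for the exact signature):
+
+```python
+from pystencilssfg import SfgConfig
+
+cfg = SfgConfig()
+
+# Hypothetical key: order all includes purely alphabetically by their
+# string form, ignoring the local/system distinction.
+cfg.code_style.includes_sorting_key = lambda header: str(header)
+```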
(cmdline_options)=
## Command-Line Options

diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 12a8435..849de18 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespac
 via `sfg.kernels`.
 To add a kernel,
 
- - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`,
+ - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`,
  - or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>`
    and register it using {any}`kernels.add() <KernelsAdder.add>`.
 
@@ -344,9 +344,63 @@ cause them to be added to its signature.
 We don't want to expose this complexity, but instead hide it by using appropriate data structures.
 The next section explains how that is achieved in pystencils-sfg.
 
-#### Invoking GPU Kernels
+#### Mapping Fields to Data Structures
+
+Pystencils kernels operate on n-dimensional contiguous or strided arrays.
+There exist many classes with diverse APIs modelling such arrays throughout the scientific
+computing landscape, including [Kokkos Views][kokkos_view], [C++ std::mdspan][mdspan],
+[SYCL buffers][sycl_buffer], and many framework-specific custom-built classes.
+Using the protocols behind {any}`sfg.map_field <SfgBasicComposer.map_field>`,
+it is possible to automatically emit code
+that extracts the indexing information required by a kernel from any of these classes,
+as long as a suitable API reflection is available.
+
+:::{seealso}
+[](#field_data_structure_reflection) for instructions on how to set up field API
+reflection for a custom nd-array data structure.
+:::
+
+Pystencils-sfg natively provides field extraction for a number of C++ STL-classes,
+such as `std::vector` and `std::span` (for 1D fields) and `std::mdspan`.
+Import any of them from `pystencilssfg.lang.cpp.std` and create an instance for a given
+field using `.from_field()`.
+Then, inside the wrapper function, pass the symbolic field and its associated data structure to
+{any}`sfg.map_field <SfgBasicComposer.map_field>`
+before calling the kernel:
+
+```{code-cell} ipython3
+import pystencils as ps
+from pystencilssfg.lang.cpp import std
+
+with SourceFileGenerator() as sfg:
+    # Create symbolic fields
+    f, g = ps.fields("f, g: double[1D]")
+
+    # Create data structure reflections
+    f_vec = std.vector.from_field(f)
+    g_span = std.span.from_field(g)
+
+    # Create the kernel
+    asm = ps.Assignment(f(0), g(0))
+    khandle = sfg.kernels.create(asm, "my_kernel")
+
+    # Create the wrapper function
+    sfg.function("call_my_kernel")(
+        sfg.map_field(f, f_vec),
+        sfg.map_field(g, g_span),
+        sfg.call(khandle)
+    )
+```
+
+## GPU Kernels
 
 Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
+This section describes how to generate GPU kernels through pystencils-sfg,
+how to invoke them with various launch configurations,
+and how GPU execution streams are reflected.
+
+### Generate and Invoke CUDA and HIP Kernels
+
 To generate a kernel targetting either of these, set the
 {any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
 code generator option to either `Target.CUDA` or `Target.HIP`.
@@ -364,7 +418,7 @@ with SourceFileGenerator(sfg_config) as sfg:
     cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
 
     # Create fields, assemble assignments
-    f, g = ps.fields("f, g: double[2D]")
+    f, g = ps.fields("f, g: double[128, 128]")
     asm = ps.Assignment(f(0), g(0))
 
     # Register kernel
@@ -384,61 +438,130 @@ When investigating the generated `.cu` file, you can see that the GPU launch con
 This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>`
 category of the `CreateKernelConfig`.
 
-#### Mapping Fields to Data Structures
+### Adapting the Launch Configuration
 
-Pystencils kernels operate on n-dimensional contiguous or strided arrays,
-There exist many classes with diverse APIs modelling such arrays throughout the scientific
-computing landscape, including [Kokkos Views][kokkos_view], [C++ std::mdspan][mdspan],
-[SYCL buffers][sycl_buffer], and many framework-specific custom-built classes.
-Using the protocols behind {any}`sfg.map_field <SfgBasicComposer.map_field>`,
-it is possible to automatically emit code
-that extracts the indexing information required by a kernel from any of these classes,
-as long as a suitable API reflection is available.
+GPU kernel invocations usually require the user to provide a launch grid, defined
+by the GPU thread block size and the number of blocks on the grid.
+In the simplest case (seen above), pystencils-sfg will emit code that automatically
+computes these parameters from the size of the arrays passed to the kernel,
+using a default block size defined by pystencils.
 
-:::{seealso}
-[](#field_data_structure_reflection) for instructions on how to set up field API
-reflection for a custom nd-array data structure.
-:::
+The code generator also permits customization of the launch configuration.
+You may provide a custom block size to override the default, in which case the
+grid size will still be computed by dividing the array sizes by your block size.
+Alternatively, you can take full control of both the block and the grid size.
+Instructions for both cases are given below.
 
-Pystencils-sfg natively provides field extraction for a number of C++ STL-classes,
-such as `std::vector` and `std::span` (for 1D fields) and `std::mdspan`.
-Import any of them from `pystencilssfg.lang.cpp.std` and create an instance for a given
-field using `.from_field()`.
-Then, inside the wrapper function, pass the symbolic field and its associated data structure to
-{any}`sfg.map_field <SfgBasicComposer.map_field>`.
-before calling the kernel:
+#### User-Defined Block Size for Auto-Computed Grid Size
+
+To merely modify the block size argument while still automatically inferring the grid size,
+pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
+Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated
+runtime headers:
+
+ - {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and
+ - {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP.
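+
+In this case, the grid size is still computed automatically: per dimension, the
+emitted wrapper divides the number of work items by the block size, rounding up.
+The arithmetic is equivalent to the following Python sketch (names are
+illustrative; the generated code performs the same ceiling division in C++):
+
+```python
+def div_ceil(work_items: int, block_size: int) -> int:
+    # Smallest number of blocks that covers all work items
+    return (work_items + block_size - 1) // block_size
+
+assert div_ceil(130, 64) == 3  # three blocks of 64 threads cover 130 work items
+```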
+
+The following snippet selects the correct `dim3` type according to the kernel target;
+it then creates a variable of that type and passes it as an argument to the kernel invocation:
 
 ```{code-cell} ipython3
-import pystencils as ps
-from pystencilssfg.lang.cpp import std
+:tags: [remove-cell]
+target = ps.Target.HIP
+cfg = ps.CreateKernelConfig(target=target)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
 
-with SourceFileGenerator() as sfg:
-    # Create symbolic fields
-    f, g = ps.fields("f, g: double[1D]")
+```{code-cell} ipython3
 
-    # Create data structure reflections
-    f_vec = std.vector.from_field(f)
-    g_span = std.span.from_field(g)
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
 
-    # Create the kernel
-    asm = ps.Assignment(f(0), g(0))
-    khandle = sfg.kernels.create(asm, "my_kernel")
+    # Select dim3 reflection
+    match target:
+        case ps.Target.CUDA:
+            from pystencilssfg.lang.gpu import cuda as gpu_api
+        case ps.Target.HIP:
+            from pystencilssfg.lang.gpu import hip as gpu_api
+
+    # Create dim3 variable and pass it to kernel invocation
+    block_size = gpu_api.dim3(const=True).var("block_size")
 
-    # Create the wrapper function
-    sfg.function("call_my_kernel")(
-        sfg.map_field(f, f_vec),
-        sfg.map_field(g, g_span),
-        sfg.call(khandle)
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size)
     )
 ```
 
-(exposed_inline_kernels)=
-### Exposed and Inline Kernels
+#### Manual Launch Configurations
+
+To take full control of the launch configuration, we must disable its automatic inference
+by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+code generator option to `True`.
+Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation:
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import cuda
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+
+    # Configure for manual launch config
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.manual_launch_grid = True
+
+    # Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    # Create dim3 variables
+    block_size = cuda.dim3(const=True).var("block_size")
+    grid_size = cuda.dim3(const=True).var("grid_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size)
+    )
+```
+
+### Using Streams
+
+CUDA and HIP kernels can be enqueued into streams for concurrent execution.
+This is mirrored in pystencils-sfg;
+all overloads of `gpu_invoke` take an optional `stream` argument.
+The `stream_t` data types of both CUDA and HIP are made available
+through the respective API reflections:
+
+ - {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and
+ - {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`.
+
+Here is an example that creates a variable of the HIP stream type
+and passes it to `gpu_invoke`:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+ khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + stream = hip.stream_t(const=True).var("stream") + + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle, stream=stream) + ) +``` :::{admonition} To Do - - Modifying GPU kernel launch configs - - GPU API Reflections - Defining classes, their fields constructors, and methods ::: diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py index c9736fb..e3b5516 100644 --- a/src/pystencilssfg/lang/gpu.py +++ b/src/pystencilssfg/lang/gpu.py @@ -56,7 +56,7 @@ class CudaAPI(ProvidesGpuRuntimeAPI): cuda = CudaAPI -"""Reflection of the CUDA runtime API""" +"""Alias for `CudaAPI`""" class HipAPI(ProvidesGpuRuntimeAPI): @@ -72,4 +72,4 @@ class HipAPI(ProvidesGpuRuntimeAPI): hip = HipAPI -"""Reflection of the HIP runtime API""" +"""Alias for `HipAPI`""" -- GitLab From 9fda3a06d6fea2d5793943d13786bd876cac57ee Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 19 Mar 2025 13:31:20 +0000 Subject: [PATCH 25/28] fix default block size for dynamic launch grids --- src/pystencilssfg/composer/gpu_composer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 72628ab..a1d01c4 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -186,13 +186,8 @@ class SfgGpuComposer(SfgComposerMixIn): block_size_init_args: tuple[ExprLike, ...] if user_block_size is None: - if launch_config.block_size is None: - raise ValueError( - "Neither a user-defined nor a default block size was defined." - ) - block_size_init_args = tuple( - str(bs) for bs in launch_config.block_size + str(bs) for bs in launch_config.default_block_size ) else: block_size_init_args = (user_block_size,) -- GitLab From cefe0bdd269e88dd9e7630fe4cf0e4933e1c6180 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 20 Mar 2025 09:56:17 +0100 Subject: [PATCH 26/28] fix outdated deprecation notice --- src/pystencilssfg/composer/gpu_composer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index a1d01c4..2315a76 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -241,7 +241,7 @@ class SfgGpuComposer(SfgComposerMixIn): warn( "cuda_invoke is deprecated and will be removed before version 0.1. 
" - "Call `use_cuda()` and use `gpu_invoke` instead.", + "Use `gpu_invoke` instead.", FutureWarning, ) -- GitLab From 8b597b98abb39c082893b50e4836436a0c7a925b Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 21 Mar 2025 11:05:06 +0100 Subject: [PATCH 27/28] clean up implementation of gpu_invoke using a builder --- src/pystencilssfg/composer/gpu_composer.py | 284 ++++++++++++--------- 1 file changed, 169 insertions(+), 115 deletions(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 2315a76..06021dd 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -10,8 +10,9 @@ from pystencils.codegen.gpu_indexing import ( ) from .mixin import SfgComposerMixIn -from .basic_composer import SfgBasicComposer, make_statements +from .basic_composer import make_statements, make_sequence +from ..context import SfgContext from ..ir import ( SfgKernelHandle, SfgCallTreeNode, @@ -99,18 +100,60 @@ class SfgGpuComposer(SfgComposerMixIn): stream: ExprLike | None = None, ) -> SfgCallTreeNode: ... - def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode: - assert isinstance( - self, SfgBasicComposer - ) # for type checking this function body + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + shared_memory_bytes: ExprLike = "0", + stream: ExprLike | None = None, + **kwargs, + ) -> SfgCallTreeNode: + builder = GpuInvocationBuilder(self._ctx, kernel_handle) + builder.shared_memory_bytes = shared_memory_bytes + builder.stream = stream + + return builder(**kwargs) + + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + num_blocks: ExprLike, + threads_per_block: ExprLike, + stream: ExprLike | None, + ): + from warnings import warn + + warn( + "cuda_invoke is deprecated and will be removed before version 0.1. 
" + "Use `gpu_invoke` instead.", + FutureWarning, + ) + + return self.gpu_invoke( + kernel_handle, + grid_size=num_blocks, + block_size=threads_per_block, + stream=stream, + ) + + +class GpuInvocationBuilder: + def __init__( + self, + ctx: SfgContext, + kernel_handle: SfgKernelHandle, + ): + self._ctx = ctx + self._kernel_handle = kernel_handle ker = kernel_handle.kernel if not isinstance(ker, GpuKernel): - raise ValueError(f"Non-GPU kernel was passed to `cuda_invoke`: {ker}") + raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}") launch_config = ker.get_launch_configuration() + self._launch_config = launch_config + gpu_api: type[ProvidesGpuRuntimeAPI] match ker.target: case Target.CUDA: @@ -120,134 +163,145 @@ class SfgGpuComposer(SfgComposerMixIn): case _: assert False, "unexpected GPU target" - dim3 = gpu_api.dim3 + self._gpu_api = gpu_api + self._dim3 = gpu_api.dim3 - grid_size: ExprLike - block_size: ExprLike - shared_memory_bytes: ExprLike = kwargs.get("shared_memory_bytes", "0") - stream: ExprLike | None = kwargs.get("stream", None) + self._shared_memory_bytes: ExprLike = "0" + self._stream: ExprLike | None - def _render_invocation(grid_size: ExprLike, block_size: ExprLike): - stmt_grid_size = make_statements(grid_size) - stmt_block_size = make_statements(block_size) - stmt_smem = ( - make_statements(shared_memory_bytes) - if shared_memory_bytes is not None - else None - ) - stmt_stream = make_statements(stream) if stream is not None else None - - return self.seq( - "// clang-format off: " - "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.", - SfgGpuKernelInvocation( - kernel_handle, - stmt_grid_size, - stmt_block_size, - shared_memory_bytes=stmt_smem, - stream=stmt_stream, - ), - "// clang-format on", - ) + @property + def shared_memory_bytes(self) -> ExprLike: + return self._shared_memory_bytes - def to_uint32_t(expr: AugExpr) -> AugExpr: - return AugExpr("uint32_t").format("uint32_t({})", expr) + @shared_memory_bytes.setter + def shared_memory_bytes(self, bs: ExprLike): + self._shared_memory_bytes = bs - match launch_config: - case ManualLaunchConfiguration(): - grid_size = kwargs["grid_size"] - block_size = kwargs["block_size"] + @property + def stream(self) -> ExprLike | None: + return self._stream - return _render_invocation(grid_size, block_size) + @stream.setter + def stream(self, s: ExprLike | None): + self._stream = s + def _render_invocation( + self, grid_size: ExprLike, block_size: ExprLike + ) -> SfgSequence: + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_smem = make_statements(self._shared_memory_bytes) + stmt_stream = ( + make_statements(self._stream) if self._stream is not None else None + ) + + return make_sequence( + "// clang-format off: " + "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.", + SfgGpuKernelInvocation( + self._kernel_handle, + stmt_grid_size, + stmt_block_size, + shared_memory_bytes=stmt_smem, + stream=stmt_stream, + ), + "// clang-format on", + ) + + def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode: + match self._launch_config: + case ManualLaunchConfiguration(): + return self._invoke_manual(**kwargs) case AutomaticLaunchConfiguration(): - grid_size_entries = [ - to_uint32_t(self.expr_from_lambda(gs)) - for gs in launch_config._grid_size - ] - grid_size_var = dim3(const=True).var("__grid_size") + return self._invoke_automatic(**kwargs) + case DynamicBlockSizeLaunchConfiguration(): + 
return self._invoke_dynamic(**kwargs) + case _: + raise ValueError( + f"Unexpected launch configuration: {self._launch_config}" + ) - block_size_entries = [ - to_uint32_t(self.expr_from_lambda(bs)) - for bs in launch_config._block_size - ] - block_size_var = dim3(const=True).var("__block_size") + def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike): + assert isinstance(self._launch_config, ManualLaunchConfiguration) + return self._render_invocation(grid_size, block_size) - nodes = [ - self.init(grid_size_var)(*grid_size_entries), - self.init(block_size_var)(*block_size_entries), - _render_invocation(grid_size_var, block_size_var), - ] + def _invoke_automatic(self): + assert isinstance(self._launch_config, AutomaticLaunchConfiguration) - return SfgBlock(SfgSequence(nodes)) + from .composer import SfgComposer - case DynamicBlockSizeLaunchConfiguration(): - user_block_size: ExprLike | None = kwargs.get("block_size", None) + sfg = SfgComposer(self._ctx) - block_size_init_args: tuple[ExprLike, ...] - if user_block_size is None: - block_size_init_args = tuple( - str(bs) for bs in launch_config.default_block_size - ) - else: - block_size_init_args = (user_block_size,) + grid_size_entries = [ + self._to_uint32_t(sfg.expr_from_lambda(gs)) + for gs in self._launch_config._grid_size + ] + grid_size_var = self._dim3(const=True).var("__grid_size") - block_size_var = dim3(const=True).var("__block_size") + block_size_entries = [ + self._to_uint32_t(sfg.expr_from_lambda(bs)) + for bs in self._launch_config._block_size + ] + block_size_var = self._dim3(const=True).var("__block_size") - from ..lang.cpp import std + nodes = [ + sfg.init(grid_size_var)(*grid_size_entries), + sfg.init(block_size_var)(*block_size_entries), + self._render_invocation(grid_size_var, block_size_var), + ] - work_items_entries = [ - self.expr_from_lambda(wit) for wit in launch_config.num_work_items - ] - work_items_var = std.tuple( - "uint32_t", "uint32_t", "uint32_t", const=True - ).var("__work_items") - - def _div_ceil(a: ExprLike, b: ExprLike): - return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) - - grid_size_entries = [ - _div_ceil(work_items_var.get(i), bs) - for i, bs in enumerate( - [ - block_size_var.x, - block_size_var.y, - block_size_var.z, - ] - ) - ] - grid_size_var = dim3(const=True).var("__grid_size") + return SfgBlock(SfgSequence(nodes)) - nodes = [ - self.init(block_size_var)(*block_size_init_args), - self.init(work_items_var)(*work_items_entries), - self.init(grid_size_var)(*grid_size_entries), - _render_invocation(grid_size_var, block_size_var), - ] + def _invoke_dynamic(self, block_size: ExprLike | None = None): + assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration) - return SfgBlock(SfgSequence(nodes)) + from .composer import SfgComposer - case _: - raise ValueError(f"Unexpected launch configuration: {launch_config}") + sfg = SfgComposer(self._ctx) - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - num_blocks: ExprLike, - threads_per_block: ExprLike, - stream: ExprLike | None, - ): - from warnings import warn + block_size_init_args: tuple[ExprLike, ...] + if block_size is None: + block_size_init_args = tuple( + str(bs) for bs in self._launch_config.default_block_size + ) + else: + block_size_init_args = (block_size,) - warn( - "cuda_invoke is deprecated and will be removed before version 0.1. 
" - "Use `gpu_invoke` instead.", - FutureWarning, - ) + block_size_var = self._dim3(const=True).var("__block_size") - return self.gpu_invoke( - kernel_handle, - grid_size=num_blocks, - block_size=threads_per_block, - stream=stream, + from ..lang.cpp import std + + work_items_entries = [ + sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items + ] + work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var( + "__work_items" ) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + block_size_var.x, + block_size_var.y, + block_size_var.z, + ] + ) + ] + grid_size_var = self._dim3(const=True).var("__grid_size") + + nodes = [ + sfg.init(block_size_var)(*block_size_init_args), + sfg.init(work_items_var)(*work_items_entries), + sfg.init(grid_size_var)(*grid_size_entries), + self._render_invocation(grid_size_var, block_size_var), + ] + + return SfgBlock(SfgSequence(nodes)) + + @staticmethod + def _to_uint32_t(expr: AugExpr) -> AugExpr: + return AugExpr("uint32_t").format("uint32_t({})", expr) -- GitLab From dc1a393595156af4a3fdfcd7c9d77b71440101a6 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 21 Mar 2025 11:48:51 +0100 Subject: [PATCH 28/28] added missing default value --- src/pystencilssfg/composer/gpu_composer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py index 06021dd..aaffb02 100644 --- a/src/pystencilssfg/composer/gpu_composer.py +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -167,7 +167,7 @@ class GpuInvocationBuilder: self._dim3 = gpu_api.dim3 self._shared_memory_bytes: ExprLike = "0" - self._stream: ExprLike | None + self._stream: ExprLike | None = None @property def shared_memory_bytes(self) -> ExprLike: -- GitLab