Skip to content
Snippets Groups Projects

Extend Support for CUDA and HIP kernel invocations

Merged Frederik Hennig requested to merge fhennig/cuda-invoke into master
All threads resolved!
Viewing commit b1b71c32
Show latest version
4 files
+ 146
62
Preferences
Compare changes
Files
4
@@ -29,31 +29,24 @@ class SfgGpuComposer(SfgComposerMixIn):
self._gpu_api_provider: ProvidesGpuRuntimeAPI | None = None
def use_cuda(self):
    """Instruct the GPU composer to use the CUDA runtime API"""
    from ..lang.gpu import CudaAPI

    # Refuse to overwrite a previously selected, different runtime API;
    # re-selecting CUDA itself is a harmless no-op.
    already_selected = self._gpu_api_provider is not None
    if already_selected and not isinstance(self._gpu_api_provider, CudaAPI):
        raise ValueError(
            "Cannot select CUDA GPU API since another API was already chosen"
        )
    self._gpu_api_provider = CudaAPI()
def use_hip(self):
    """Instruct the GPU composer to use the HIP runtime API"""
    from ..lang.gpu import HipAPI

    # Refuse to overwrite a previously selected, different runtime API;
    # re-selecting HIP itself is a harmless no-op.
    already_selected = self._gpu_api_provider is not None
    if already_selected and not isinstance(self._gpu_api_provider, HipAPI):
        raise ValueError(
            "Cannot select HIP GPU API since another API was already chosen"
        )
    self._gpu_api_provider = HipAPI()
@property
def gpu_api(self) -> ProvidesGpuRuntimeAPI:
"""GPU runtime API wrapper currently used by this GPU composer.
Raises:
AttributeError: If no runtime API was set yet (see `use_cuda`, `use_hip`)
"""
if self._gpu_api_provider is None:
raise AttributeError(
"No GPU API was selected - call `use_cuda()` or `use_hip()` first."
@@ -104,7 +97,8 @@ class SfgGpuComposer(SfgComposerMixIn):
This signature accepts kernels generated with an indexing scheme that permits a user-defined
block size, such as `Linear3D <IndexingScheme.Linear3D>`.
The grid size is calculated automatically.
The grid size is calculated automatically by dividing the number of work items in each
dimension by the block size, rounding up.
"""
def gpu_invoke(self, kernel_handle: SfgKernelHandle, **kwargs) -> SfgCallTreeNode:
@@ -144,6 +138,9 @@ class SfgGpuComposer(SfgComposerMixIn):
stream=stmt_stream,
)
def to_uint32_t(expr: AugExpr) -> AugExpr:
    """Wrap *expr* in an explicit C++ ``uint32_t(...)`` conversion expression."""
    target = AugExpr("uint32_t")
    return target.format("uint32_t({})", expr)
match launch_config:
case ManualLaunchConfiguration():
grid_size = kwargs["grid_size"]
@@ -153,12 +150,14 @@ class SfgGpuComposer(SfgComposerMixIn):
case AutomaticLaunchConfiguration():
grid_size_entries = [
self.expr_from_lambda(gs) for gs in launch_config._grid_size
to_uint32_t(self.expr_from_lambda(gs))
for gs in launch_config._grid_size
]
grid_size_var = dim3(const=True).var("__grid_size")
block_size_entries = [
self.expr_from_lambda(bs) for bs in launch_config._block_size
to_uint32_t(self.expr_from_lambda(bs))
for bs in launch_config._block_size
]
block_size_var = dim3(const=True).var("__block_size")
@@ -197,27 +196,16 @@ class SfgGpuComposer(SfgComposerMixIn):
"uint32_t", "uint32_t", "uint32_t", const=True
).var("__work_items")
def _min(a: ExprLike, b: ExprLike):
    """Render a C++ ternary expression selecting the smaller of ``a`` and ``b``."""
    operands = {"a": a, "b": b}
    return AugExpr.format("{a} < {b} ? {a} : {b}", **operands)
def _div_ceil(a: ExprLike, b: ExprLike):
    """Render a C++ integer expression computing ``a / b`` rounded up."""
    operands = {"a": a, "b": b}
    return AugExpr.format("({a} + {b} - 1) / {b}", **operands)
reduced_block_size_entries = [
_min(work_items_var.get(i), bs)
for i, bs in enumerate(
[block_size_var.x, block_size_var.y, block_size_var.z]
)
]
reduced_block_size_var = dim3(const=True).var("__reduced_block_size")
grid_size_entries = [
_div_ceil(work_items_var.get(i), bs)
for i, bs in enumerate(
[
reduced_block_size_var.x,
reduced_block_size_var.y,
reduced_block_size_var.z,
block_size_var.x,
block_size_var.y,
block_size_var.z,
]
)
]
@@ -226,9 +214,8 @@ class SfgGpuComposer(SfgComposerMixIn):
nodes = [
self.init(block_size_var)(*block_size_init_args),
self.init(work_items_var)(*work_items_entries),
self.init(reduced_block_size_var)(*reduced_block_size_entries),
self.init(grid_size_var)(*grid_size_entries),
_render_invocation(grid_size_var, reduced_block_size_var),
_render_invocation(grid_size_var, block_size_var),
]
return SfgBlock(SfgSequence(nodes))