diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1420bd2b53e918cfce8e6b5491e94888ad36a0f9..ffee8829de5e64a54fcc0937745b08b97429d61b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -24,7 +24,6 @@ typechecker:
     - nox --session typecheck
 
 .testsuite-base:
-  extends: .nox-base
   stage: "Tests"
   needs: []
   coverage: '/TOTAL.*\s+(\d+%)$/'
@@ -38,13 +37,18 @@ typechecker:
         coverage_format: cobertura
         path: coverage.xml
 
-"testsuite-py3.10":
+"testsuite-py3.10+cuda":
   extends: .testsuite-base
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:ubuntu24.04-cuda12.6
   script:
     - nox --session testsuite-3.10
+  tags:
+    - docker
+    - cuda11
 
 "testsuite-py3.13":
   extends: .testsuite-base
+  image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine
   script:
     - nox --session testsuite-3.13
 
diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py
index 0269d40f43492ea1540f51f49c8e78c5ebebf37d..91a96233b0af6831dbd0c8ac4db06d862517d994 100644
--- a/docs/source/_util/sfg_monkeypatch.py
+++ b/docs/source/_util/sfg_monkeypatch.py
@@ -1,6 +1,8 @@
 import pystencilssfg
 from pystencilssfg.config import SfgConfig
 
+from os.path import splitext
+
 
 class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     """Mockup wrapper around SourceFileGenerator for use in documentation
@@ -28,23 +30,23 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator):
     def __exit__(self, exc_type, exc_value, traceback):
         if exc_type is None:
             self._finish_files()
+            emitter = self._get_emitter()
 
-            header_code = self._emitter.dumps(self._header_file)
-            impl_code = (
-                None
-                if self._impl_file is None
-                else self._emitter.dumps(self._impl_file)
-            )
+            header_code = emitter.dumps(self._header_file)
+            header_ext = splitext(self._header_file.name)[1]
 
             mdcode = ":::::{tab-set}\n"
 
-            mdcode += "::::{tab-item} Generated Header (.hpp)\n"
+            mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n"
             mdcode += ":::{code-block} C++\n\n"
             mdcode += header_code
             mdcode += "\n:::\n::::\n"
 
-            if impl_code:
-                mdcode += "::::{tab-item} Generated Implementation (.cpp)\n"
+            if self._impl_file is not None:
+                impl_code = emitter.dumps(self._impl_file)
+                impl_ext = splitext(self._impl_file.name)[1]
+
+                mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n"
                 mdcode += ":::{code-block} C++\n\n"
                 mdcode += impl_code
                 mdcode += "\n:::\n::::\n"
diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst
index 124d0fb97ac8f94bc2a6d4c38815edee8403c65b..8b470b033cfb2755dd57c5f8cce682f64756b0d2 100644
--- a/docs/source/api/composer.rst
+++ b/docs/source/api/composer.rst
@@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``)
 .. autoclass:: SfgClassComposer
     :members:
 
+.. autoclass:: SfgGpuComposer
+    :members:
+
 Custom Generators
 =================
 
@@ -37,6 +40,7 @@ Helper Methods and Builders
 
 .. autoclass:: SfgFunctionSequencer
     :members:
+    :inherited-members:
 
 .. autoclass:: SfgNodeBuilder
     :members:
@@ -47,6 +51,12 @@ Helper Methods and Builders
 .. autoclass:: SfgSwitchBuilder
     :members:
 
+.. module:: pystencilssfg.composer.class_composer
+
+.. autoclass:: SfgMethodSequencer
+    :members:
+    :inherited-members:
+
 Context and Cursor
 ==================
 
diff --git a/docs/source/api/lang.rst b/docs/source/api/lang.rst
index bd5e4fa7993f30d4c8f6e5c532eb3d234c64061d..cdf8f242cf58b3f0d9889ad94cef6c7a8b7a8155 100644
--- a/docs/source/api/lang.rst
+++ b/docs/source/api/lang.rst
@@ -41,3 +41,9 @@ Implementation
 
 .. automodule:: pystencilssfg.lang.cpp
     :members:
+
+GPU Runtime APIs
+----------------
+
+.. automodule:: pystencilssfg.lang.gpu
+    :members:
diff --git a/docs/source/usage/config_and_cli.md b/docs/source/usage/config_and_cli.md
index 785ff52ed1d5153b04be9e900363af156ff94e7c..b6060c0b07f5a7a4f0165f6c681db5fb97bea976 100644
--- a/docs/source/usage/config_and_cli.md
+++ b/docs/source/usage/config_and_cli.md
@@ -12,7 +12,7 @@ different configuration sources:
   the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options)
 - **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts
   may be configured globally within that project by the use of a *configuration module*.
-  Settings specified inside that configuration module are always overridden by the former to configuration sources.
+  Settings specified inside that configuration module are always overridden by the two other configuration sources listed above.
   For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration).
 
 (inline_config)=
@@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`.
 
 ### Code Style and Formatting
 
- - Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect
-   certain formatting aspects of the generated code.
- - To change, enforce, or disable auto-formatting of generated code through `clang-format`,
-   take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category.
- - Clang-format will, by default, sort `#include` statements alphabetically and separate
-   local and system header includes.
-   To override this, you can set a custom sorting key for `#include` sorting via
-   {any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
+Pystencils-sfg gives you some options to affect its output code style.
+These are controlled by the options in the {any}`cfg.code_style <CodeStyle>` category.
+
+Furthermore, pystencils-sfg uses `clang-format` to beautify generated code.
+The behaviour of the clang-format integration is managed by the
+the {any}`cfg.clang_format <ClangFormatOptions>` category,
+where you can set options to skip or enforce formatting,
+or change the formatter binary.
+To set the code style used by `clang-format` either create a `.clang-format` file
+in any of the parent folders of your generator script,
+or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option.
+
+:::{seealso}
+[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html)
+:::
+
+Clang-format will, by default, sort `#include` statements alphabetically and separate
+local and system header includes.
+To override this, you can set a custom sorting key for `#include` sorting via
+{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`.
 
 (cmdline_options)=
 ## Command-Line Options
diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md
index 966a9a661b8f7c5d4d863b07c2a9549a95032591..849de18a3cca12e8bfc697e7b84aeb8ae3950239 100644
--- a/docs/source/usage/how_to_composer.md
+++ b/docs/source/usage/how_to_composer.md
@@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespac
 via `sfg.kernels`.
 
 To add a kernel,
- - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`,
+ - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`,
  - or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>` and register it using
    {any}`kernels.add() <KernelsAdder.add>`.
 
@@ -392,13 +392,176 @@ with SourceFileGenerator() as sfg:
     )
 ```
 
-(exposed_inline_kernels)=
-### Exposed and Inline Kernels
+## GPU Kernels
+
+Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models.
+This section describes how to generate GPU kernels through pystencils-sfg;
+how to invoke them with various launch configurations,
+and how GPU execution streams are reflected.
+
+### Generate and Invoke CUDA and HIP Kernels
+
+To generate a kernel targetting either of these, set the
+{any}`target <pystencils.codegen.config.CreateKernelConfig.target>`
+code generator option to either `Target.CUDA` or `Target.HIP`.
+After registering a GPU kernel,
+its invocation can be rendered using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`.
+Here is an example using CUDA:
+
+```{code-cell} ipython3
+from pystencilssfg import SfgConfig
+sfg_config = SfgConfig()
+sfg_config.extensions.impl = "cu"
+
+with SourceFileGenerator(sfg_config) as sfg:
+    #   Configure the code generator to use CUDA
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+    #   Create fields, assemble assignments
+    f, g = ps.fields("f, g: double[128, 128]")
+    asm = ps.Assignment(f(0), g(0))
+
+    #   Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Invoke it
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle)
+    )
+```
+
+In this snippet, we used the [generator configuration](#how_to_generator_scripts_config)
+to change the suffix of the generated implementation file to `.cu`.
+
+When investigating the generated `.cu` file, you can see that the GPU launch configuration parameters
+*grid size* and *block size* are being computed automatically from the array sizes.
+This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>`
+category of the `CreateKernelConfig`.
+
+### Adapting the Launch Configuration
+
+GPU kernel invocations usually require the user to provide a launch grid, defined
+by the GPU thread block size and the number of blocks on the grid.
+In the simplest case (seen above), pystencils-sfg will emit code that automatically
+computes these parameters from the size of the arrays passed to the kernel,
+using a default block size defined by pystencils.
+
+The code generator also permits customization of the launch configuration.
+You may provide a custom block size to override the default, in which case the
+grid size will still be computed by dividing the array sizes by your block size.
+Otherwise, you can also fully take over control of both block and grid size.
+For both cases, instructions are given in the following.
+
+#### User-Defined Block Size for Auto-Computed Grid Size
+
+To merely modify the block size argument while still automatically inferring the grid size,
+pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
+Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated
+runtime headers:
+
+ - {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and
+ - {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP.
+
+The following snippet selects the correct `dim3` type according to the kernel target;
+it then creates a variable of that type and turns that into an argument to the kernel invocation:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+target = ps.Target.HIP
+cfg = ps.CreateKernelConfig(target=target)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    #   Select dim3 reflection
+    match target:
+        case ps.Target.CUDA:
+            from pystencilssfg.lang.gpu import cuda as gpu_api
+        case ps.Target.HIP:
+            from pystencilssfg.lang.gpu import hip as gpu_api
+    
+    #   Create dim3 variable and pass it to kernel invocation
+    block_size = gpu_api.dim3(const=True).var("block_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size)
+    )
+```
+
+#### Manual Launch Configurations
+
+To take full control of the launch configuration, we must disable its automatic inferrence
+by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+code generator option to `True`.
+Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation:
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+
+    #   Configure for manual launch config
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.manual_launch_grid = True
+
+    #   Register kernel
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+    
+    #   Create dim3 variables
+    from pystencilssfg.lang.gpu import cuda
+    block_size = cuda.dim3(const=True).var("block_size")
+    grid_size = cuda.dim3(const=True).var("grid_size")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size)
+    )
+```
+
+### Using Streams
+
+CUDA and HIP kernels can be enqueued into streams for concurrent execution.
+This is mirrored in pystencils-sfg;
+all overloads of `gpu_invoke` take an optional `stream` argument.
+The `stream_t` data types of both CUDA and HIP are made available
+through the respective API reflections:
+
+ - {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and
+ - {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`.
+
+Here is an example that creates a variable of the HIP stream type
+and passes it to `gpu_invoke`:
+
+```{code-cell} ipython3
+:tags: [remove-cell]
+cfg = ps.CreateKernelConfig(target=ps.Target.HIP)
+f, g = ps.fields("f, g: double[128, 128]")
+asm = ps.Assignment(f(0), g(0))
+```
+
+```{code-cell} ipython3
+from pystencilssfg.lang.gpu import hip
+
+with SourceFileGenerator(sfg_config) as sfg:
+    # ... define kernel ...
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg)
+
+    stream = hip.stream_t(const=True).var("stream")
+
+    sfg.function("kernel_wrapper")(
+        sfg.gpu_invoke(khandle, stream=stream)
+    )
+```
 
 :::{admonition} To Do
 
- - Creating and calling kernels
- - Invoking GPU kernels and the CUDA API Mirror
  - Defining classes, their fields constructors, and methods
 
 :::
diff --git a/src/pystencilssfg/composer/__init__.py b/src/pystencilssfg/composer/__init__.py
index f6af76b8b9c36445990fc451983fec5c14a4cf34..c8f279ecd43c9e7809e8f7796c5ad4ad36ba7a76 100644
--- a/src/pystencilssfg/composer/__init__.py
+++ b/src/pystencilssfg/composer/__init__.py
@@ -9,6 +9,7 @@ from .basic_composer import (
 )
 from .mixin import SfgComposerMixIn
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer
 
 __all__ = [
     "SfgIComposer",
@@ -20,4 +21,5 @@ __all__ = [
     "ExprLike",
     "SfgBasicComposer",
     "SfgClassComposer",
+    "SfgGpuComposer",
 ]
diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py
index 31337a6282932420f8d5b6d9093deec5c2caea1a..d78e43deb53345aceca56baa6cf63f07b3a1d8de 100644
--- a/src/pystencilssfg/composer/basic_composer.py
+++ b/src/pystencilssfg/composer/basic_composer.py
@@ -13,7 +13,7 @@ from pystencils import (
     Assignment,
     AssignmentCollection,
 )
-from pystencils.codegen import Kernel
+from pystencils.codegen import Kernel, Lambda
 from pystencils.types import create_type, UserTypeSpec, PsType
 
 from ..context import SfgContext, SfgCursor
@@ -21,7 +21,6 @@ from .custom import CustomGenerator
 from ..ir import (
     SfgCallTreeNode,
     SfgKernelCallNode,
-    SfgCudaKernelInvocation,
     SfgStatements,
     SfgFunctionParams,
     SfgRequireIncludes,
@@ -53,6 +52,7 @@ from ..lang import (
     HeaderFile,
     includes,
     SfgVar,
+    SfgKernelParamVar,
     AugExpr,
     SupportsFieldExtraction,
     SupportsVectorExtraction,
@@ -390,34 +390,14 @@ class SfgBasicComposer(SfgIComposer):
         """Use inside a function body to directly call a kernel.
 
         When using `call`, the given kernel will simply be called as a function.
-        To invoke a GPU kernel on a specified launch grid, use `cuda_invoke`
-        or the interfaces of ``pystencilssfg.extensions.sycl`` instead.
+        To invoke a GPU kernel on a specified launch grid,
+        use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead.
 
         Args:
             kernel_handle: Handle to a kernel previously added to some kernel namespace.
         """
         return SfgKernelCallNode(kernel_handle)
 
-    def cuda_invoke(
-        self,
-        kernel_handle: SfgKernelHandle,
-        num_blocks: ExprLike,
-        threads_per_block: ExprLike,
-        stream: ExprLike | None,
-    ):
-        """Dispatch a CUDA kernel to the device."""
-        num_blocks_str = str(num_blocks)
-        tpb_str = str(threads_per_block)
-        stream_str = str(stream) if stream is not None else None
-
-        deps = depends(num_blocks) | depends(threads_per_block)
-        if stream is not None:
-            deps |= depends(stream)
-
-        return SfgCudaKernelInvocation(
-            kernel_handle, num_blocks_str, tpb_str, stream_str, deps
-        )
-
     def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence:
         """Syntax sequencing. For details, see `make_sequence`"""
         return make_sequence(*args)
@@ -511,6 +491,11 @@ class SfgBasicComposer(SfgIComposer):
         """
         return AugExpr.format(fmt, *deps, **kwdeps)
 
+    def expr_from_lambda(self, lamb: Lambda) -> AugExpr:
+        depends = set(SfgKernelParamVar(p) for p in lamb.parameters)
+        code = lamb.c_code()
+        return AugExpr.make(code, depends, dtype=lamb.return_type)
+
     @property
     def branch(self) -> SfgBranchBuilder:
         """Use inside a function body to create an if/else conditonal branch.
@@ -564,7 +549,11 @@ class SfgBasicComposer(SfgIComposer):
         var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param
         return SfgDeferredParamSetter(var, expr)
 
-    def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction):
+    def map_vector(
+        self,
+        lhs_components: Sequence[VarLike | sp.Symbol],
+        rhs: SupportsVectorExtraction,
+    ):
         """Extracts scalar numerical values from a vector data type.
 
         Args:
diff --git a/src/pystencilssfg/composer/composer.py b/src/pystencilssfg/composer/composer.py
index bba479e3289f2b589a1d32e61997bdaa915eb47e..b1cfc4b1d5cbbe704f3fa1b7d7082be379e379b5 100644
--- a/src/pystencilssfg/composer/composer.py
+++ b/src/pystencilssfg/composer/composer.py
@@ -3,12 +3,13 @@ from typing import TYPE_CHECKING
 
 from .basic_composer import SfgBasicComposer
 from .class_composer import SfgClassComposer
+from .gpu_composer import SfgGpuComposer
 
 if TYPE_CHECKING:
     from ..context import SfgContext
 
 
-class SfgComposer(SfgBasicComposer, SfgClassComposer):
+class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer):
     """Primary interface for constructing source files in pystencils-sfg.
 
     The SfgComposer combines the `SfgBasicComposer`
@@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer):
     def __init__(self, sfg: SfgContext | SfgBasicComposer):
         SfgBasicComposer.__init__(self, sfg)
         SfgClassComposer.__init__(self)
+        SfgGpuComposer.__init__(self)
diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaffb02820baabd93aead08db30a35b57851f3fd
--- /dev/null
+++ b/src/pystencilssfg/composer/gpu_composer.py
@@ -0,0 +1,307 @@
+from __future__ import annotations
+
+from typing import overload
+
+from pystencils.codegen import GpuKernel, Target
+from pystencils.codegen.gpu_indexing import (
+    ManualLaunchConfiguration,
+    AutomaticLaunchConfiguration,
+    DynamicBlockSizeLaunchConfiguration,
+)
+
+from .mixin import SfgComposerMixIn
+from .basic_composer import make_statements, make_sequence
+
+from ..context import SfgContext
+from ..ir import (
+    SfgKernelHandle,
+    SfgCallTreeNode,
+    SfgGpuKernelInvocation,
+    SfgBlock,
+    SfgSequence,
+)
+from ..lang import ExprLike, AugExpr
+from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI
+
+
+class SfgGpuComposer(SfgComposerMixIn):
+    """Composer mix-in providing methods to generate GPU kernel invocations.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs)
+
+        Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration.
+
+        The overloads of this method are listed below.
+        They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax;
+        for details on the launch configuration arguments,
+        refer to `Launch Configurations in CUDA`_
+        or `Launch Configurations in HIP`_.
+
+    .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a manual launch grid.
+
+        Requires that the kernel was generated
+        with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>`
+        set to `True`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with an automatic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that
+        causes the launch grid to be determined automatically,
+        such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`.
+
+    .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode
+        :noindex:
+
+        Invoke a GPU kernel with a dynamic launch grid.
+
+        This signature accepts kernels generated with an indexing scheme that permits a user-defined
+        blocks size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`.
+        The grid size is calculated automatically by dividing the number of work items in each
+        dimension by the block size, rounding up.
+
+    .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+
+    .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    """  # NOQA: E501
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        grid_size: ExprLike,
+        block_size: ExprLike,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    @overload
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        *,
+        block_size: ExprLike | None = None,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+    ) -> SfgCallTreeNode: ...
+
+    def gpu_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        shared_memory_bytes: ExprLike = "0",
+        stream: ExprLike | None = None,
+        **kwargs,
+    ) -> SfgCallTreeNode:
+        builder = GpuInvocationBuilder(self._ctx, kernel_handle)
+        builder.shared_memory_bytes = shared_memory_bytes
+        builder.stream = stream
+
+        return builder(**kwargs)
+
+    def cuda_invoke(
+        self,
+        kernel_handle: SfgKernelHandle,
+        num_blocks: ExprLike,
+        threads_per_block: ExprLike,
+        stream: ExprLike | None,
+    ):
+        from warnings import warn
+
+        warn(
+            "cuda_invoke is deprecated and will be removed before version 0.1. "
+            "Use `gpu_invoke` instead.",
+            FutureWarning,
+        )
+
+        return self.gpu_invoke(
+            kernel_handle,
+            grid_size=num_blocks,
+            block_size=threads_per_block,
+            stream=stream,
+        )
+
+
+class GpuInvocationBuilder:
+    def __init__(
+        self,
+        ctx: SfgContext,
+        kernel_handle: SfgKernelHandle,
+    ):
+        self._ctx = ctx
+        self._kernel_handle = kernel_handle
+
+        ker = kernel_handle.kernel
+
+        if not isinstance(ker, GpuKernel):
+            raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}")
+
+        launch_config = ker.get_launch_configuration()
+
+        self._launch_config = launch_config
+
+        gpu_api: type[ProvidesGpuRuntimeAPI]
+        match ker.target:
+            case Target.CUDA:
+                gpu_api = CudaAPI
+            case Target.HIP:
+                gpu_api = HipAPI
+            case _:
+                assert False, "unexpected GPU target"
+
+        self._gpu_api = gpu_api
+        self._dim3 = gpu_api.dim3
+
+        self._shared_memory_bytes: ExprLike = "0"
+        self._stream: ExprLike | None = None
+
+    @property
+    def shared_memory_bytes(self) -> ExprLike:
+        return self._shared_memory_bytes
+
+    @shared_memory_bytes.setter
+    def shared_memory_bytes(self, bs: ExprLike):
+        self._shared_memory_bytes = bs
+
+    @property
+    def stream(self) -> ExprLike | None:
+        return self._stream
+
+    @stream.setter
+    def stream(self, s: ExprLike | None):
+        self._stream = s
+
+    def _render_invocation(
+        self, grid_size: ExprLike, block_size: ExprLike
+    ) -> SfgSequence:
+        stmt_grid_size = make_statements(grid_size)
+        stmt_block_size = make_statements(block_size)
+        stmt_smem = make_statements(self._shared_memory_bytes)
+        stmt_stream = (
+            make_statements(self._stream) if self._stream is not None else None
+        )
+
+        return make_sequence(
+            "// clang-format off: "
+            "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.",
+            SfgGpuKernelInvocation(
+                self._kernel_handle,
+                stmt_grid_size,
+                stmt_block_size,
+                shared_memory_bytes=stmt_smem,
+                stream=stmt_stream,
+            ),
+            "// clang-format on",
+        )
+
+    def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode:
+        match self._launch_config:
+            case ManualLaunchConfiguration():
+                return self._invoke_manual(**kwargs)
+            case AutomaticLaunchConfiguration():
+                return self._invoke_automatic(**kwargs)
+            case DynamicBlockSizeLaunchConfiguration():
+                return self._invoke_dynamic(**kwargs)
+            case _:
+                raise ValueError(
+                    f"Unexpected launch configuration: {self._launch_config}"
+                )
+
+    def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike):
+        assert isinstance(self._launch_config, ManualLaunchConfiguration)
+        return self._render_invocation(grid_size, block_size)
+
+    def _invoke_automatic(self):
+        assert isinstance(self._launch_config, AutomaticLaunchConfiguration)
+
+        from .composer import SfgComposer
+
+        sfg = SfgComposer(self._ctx)
+
+        grid_size_entries = [
+            self._to_uint32_t(sfg.expr_from_lambda(gs))
+            for gs in self._launch_config._grid_size
+        ]
+        grid_size_var = self._dim3(const=True).var("__grid_size")
+
+        block_size_entries = [
+            self._to_uint32_t(sfg.expr_from_lambda(bs))
+            for bs in self._launch_config._block_size
+        ]
+        block_size_var = self._dim3(const=True).var("__block_size")
+
+        nodes = [
+            sfg.init(grid_size_var)(*grid_size_entries),
+            sfg.init(block_size_var)(*block_size_entries),
+            self._render_invocation(grid_size_var, block_size_var),
+        ]
+
+        return SfgBlock(SfgSequence(nodes))
+
+    def _invoke_dynamic(self, block_size: ExprLike | None = None):
+        assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration)
+
+        from .composer import SfgComposer
+
+        sfg = SfgComposer(self._ctx)
+
+        block_size_init_args: tuple[ExprLike, ...]
+        if block_size is None:
+            block_size_init_args = tuple(
+                str(bs) for bs in self._launch_config.default_block_size
+            )
+        else:
+            block_size_init_args = (block_size,)
+
+        block_size_var = self._dim3(const=True).var("__block_size")
+
+        from ..lang.cpp import std
+
+        work_items_entries = [
+            sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items
+        ]
+        work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var(
+            "__work_items"
+        )
+
+        def _div_ceil(a: ExprLike, b: ExprLike):
+            return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b)
+
+        grid_size_entries = [
+            _div_ceil(work_items_var.get(i), bs)
+            for i, bs in enumerate(
+                [
+                    block_size_var.x,
+                    block_size_var.y,
+                    block_size_var.z,
+                ]
+            )
+        ]
+        grid_size_var = self._dim3(const=True).var("__grid_size")
+
+        nodes = [
+            sfg.init(block_size_var)(*block_size_init_args),
+            sfg.init(work_items_var)(*work_items_entries),
+            sfg.init(grid_size_var)(*grid_size_entries),
+            self._render_invocation(grid_size_var, block_size_var),
+        ]
+
+        return SfgBlock(SfgSequence(nodes))
+
+    @staticmethod
+    def _to_uint32_t(expr: AugExpr) -> AugExpr:
+        return AugExpr("uint32_t").format("uint32_t({})", expr)
diff --git a/src/pystencilssfg/context.py b/src/pystencilssfg/context.py
index 1622a1e3bd33259cd89eef171ad043c4ea9ef536..5773455198187fef7d670ca1f4f9232fb1d7adde 100644
--- a/src/pystencilssfg/context.py
+++ b/src/pystencilssfg/context.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Sequence, Any, Generator
 from contextlib import contextmanager
 
-from .config import CodeStyle
+from .config import CodeStyle, ClangFormatOptions
 from .ir import (
     SfgSourceFile,
     SfgNamespace,
@@ -23,6 +23,7 @@ class SfgContext:
         impl_file: SfgSourceFile | None,
         namespace: str | None = None,
         codestyle: CodeStyle | None = None,
+        clang_format_opts: ClangFormatOptions | None = None,
         argv: Sequence[str] | None = None,
         project_info: Any = None,
     ):
@@ -33,6 +34,9 @@ class SfgContext:
         self._inner_namespace: str | None = None
 
         self._codestyle = codestyle if codestyle is not None else CodeStyle()
+        self._clang_format: ClangFormatOptions = (
+            clang_format_opts if clang_format_opts is not None else ClangFormatOptions()
+        )
 
         self._header_file = header_file
         self._impl_file = impl_file
@@ -73,6 +77,10 @@ class SfgContext:
         """The code style object for this generation context."""
         return self._codestyle
 
+    @property
+    def clang_format(self) -> ClangFormatOptions:
+        return self._clang_format
+
     @property
     def header_file(self) -> SfgSourceFile:
         return self._header_file
@@ -150,6 +158,9 @@ class SfgCursor:
             self._loc[f].append(block)
             self._loc[f] = block.elements
 
+        outer_namespace = self._cur_namespace
+        self._cur_namespace = namespace
+
         @contextmanager
         def ctxmgr():
             try:
@@ -157,5 +168,6 @@ class SfgCursor:
             finally:
                 #   Have the cursor step back out of the nested namespace blocks
                 self._loc = outer_locs
+                self._cur_namespace = outer_namespace
 
         return ctxmgr()
diff --git a/src/pystencilssfg/extensions/gpu.py b/src/pystencilssfg/extensions/gpu.py
deleted file mode 100644
index b4242ac81aa166085dca20b4936cc56e1b829720..0000000000000000000000000000000000000000
--- a/src/pystencilssfg/extensions/gpu.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from pystencilssfg import lang
-
-
-def dim3class(gpu_runtime_header: str, *, cls_name: str = "dim3"):
-    """
-    >>> dim3 = dim3class("<hip/hip_runtime.h>")
-    >>> dim3().ctor(64, 1, 1)
-    dim3{64, 1, 1}
-
-    Args:
-        gpu_runtime_header: String with the name of the gpu runtime header
-        cls_name: String with the acutal name (default "dim3")
-    Returns:
-        Dim3Class: A `lang.CppClass` that mimics cuda's/hip's `dim3`
-    """
-    @lang.cppclass(cls_name, gpu_runtime_header)
-    class Dim3Class:
-        def ctor(self, dim0=1, dim1=1, dim2=1):
-            return self.ctor_bind(dim0, dim1, dim2)
-
-        @property
-        def x(self):
-            return lang.AugExpr.format("{}.x", self)
-
-        @property
-        def y(self):
-            return lang.AugExpr.format("{}.y", self)
-
-        @property
-        def z(self):
-            return lang.AugExpr.format("{}.z", self)
-
-        @property
-        def dims(self):
-            """The dims property."""
-            return [self.x, self.y, self.z]
-
-    return Dim3Class
diff --git a/src/pystencilssfg/generator.py b/src/pystencilssfg/generator.py
index c314d67bbc45d5da639d3b7a2b2f92667cc77586..fe4eb99d4cf15056e457fd14a7976d54bb0db6ae 100644
--- a/src/pystencilssfg/generator.py
+++ b/src/pystencilssfg/generator.py
@@ -95,9 +95,7 @@ class SourceFileGenerator:
             self._impl_file = SfgSourceFile(
                 output_files[1].name, SfgSourceFileType.TRANSLATION_UNIT
             )
-            self._impl_file.includes.append(
-                HeaderFile.parse(self._header_file.name)
-            )
+            self._impl_file.includes.append(HeaderFile.parse(self._header_file.name))
 
         #   TODO: Find a way to not hard-code the restrict qualifier in pystencils
         self._header_file.elements.append("#define RESTRICT __restrict__")
@@ -115,14 +113,11 @@ class SourceFileGenerator:
             self._impl_file,
             namespace,
             config.codestyle,
+            config.clang_format,
             argv=script_args,
             project_info=cli_params.get_project_info(),
         )
 
-        self._emitter = SfgCodeEmitter(
-            self._output_dir, config.codestyle, config.clang_format
-        )
-
         sort_key = config.codestyle.get_option("includes_sorting_key")
         if sort_key is None:
 
@@ -161,6 +156,13 @@ class SourceFileGenerator:
             )
             self._impl_file.includes.sort(key=self._include_sort_key)
 
+    def _get_emitter(self):
+        return SfgCodeEmitter(
+            self._output_dir,
+            self._context.codestyle,
+            self._context.clang_format,
+        )
+
     def __enter__(self) -> SfgComposer:
         self.clean_files()
         return SfgComposer(self._context)
@@ -169,6 +171,7 @@ class SourceFileGenerator:
         if exc_type is None:
             self._finish_files()
 
-            self._emitter.emit(self._header_file)
+            emitter = self._get_emitter()
+            emitter.emit(self._header_file)
             if self._impl_file is not None:
-                self._emitter.emit(self._impl_file)
+                emitter.emit(self._impl_file)
diff --git a/src/pystencilssfg/ir/__init__.py b/src/pystencilssfg/ir/__init__.py
index 8f03fed0d4c2467377cdaab6cf100a13f7ded9fb..0d93fb148333921d60f25cf68ace89f61873e383 100644
--- a/src/pystencilssfg/ir/__init__.py
+++ b/src/pystencilssfg/ir/__init__.py
@@ -3,7 +3,7 @@ from .call_tree import (
     SfgCallTreeLeaf,
     SfgEmptyNode,
     SfgKernelCallNode,
-    SfgCudaKernelInvocation,
+    SfgGpuKernelInvocation,
     SfgBlock,
     SfgSequence,
     SfgStatements,
@@ -47,7 +47,7 @@ __all__ = [
     "SfgCallTreeLeaf",
     "SfgEmptyNode",
     "SfgKernelCallNode",
-    "SfgCudaKernelInvocation",
+    "SfgGpuKernelInvocation",
     "SfgSequence",
     "SfgBlock",
     "SfgStatements",
diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py
index 24a315d5a0ae0319cbc1b906f98deacc72828176..61c5bbbe01cdd42ae7b94b7843aa1e916c3afedb 100644
--- a/src/pystencilssfg/ir/call_tree.py
+++ b/src/pystencilssfg/ir/call_tree.py
@@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC):
     Therefore, every instantiable call tree node must implement the method `get_code`.
     By convention, the string returned by `get_code` should not contain a trailing newline.
     """
+
     def __init__(self) -> None:
         self._includes: set[HeaderFile] = set()
 
@@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC):
         By convention, the code block emitted by this function should not contain a trailing newline.
         """
 
+    @property
+    def depends(self) -> set[SfgVar]:
+        """Set of objects this leaf depends on"""
+        return set()
+
     @property
     def required_includes(self) -> set[HeaderFile]:
         """Return a set of header includes required by this node"""
@@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC):
     def children(self) -> Sequence[SfgCallTreeNode]:
         return ()
 
-    @property
-    @abstractmethod
-    def depends(self) -> set[SfgVar]:
-        """Set of objects this leaf depends on"""
-
 
 class SfgEmptyNode(SfgCallTreeLeaf):
     """A leaf node that does not emit any code.
@@ -202,52 +203,76 @@ class SfgKernelCallNode(SfgCallTreeLeaf):
         return set(self._kernel_handle.parameters)
 
     def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
         fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])
 
         return f"{fnc_name}({call_parameters});"
 
 
-class SfgCudaKernelInvocation(SfgCallTreeLeaf):
+class SfgGpuKernelInvocation(SfgCallTreeNode):
+    """A CUDA or HIP kernel invocation.
+
+    See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration
+    or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions
+    for the syntax.
+    """
+
     def __init__(
         self,
         kernel_handle: SfgKernelHandle,
-        num_blocks_code: str,
-        threads_per_block_code: str,
-        stream_code: str | None,
-        depends: set[SfgVar],
+        grid_size: SfgStatements,
+        block_size: SfgStatements,
+        shared_memory_bytes: SfgStatements | None,
+        stream: SfgStatements | None,
     ):
-        from pystencils import Target
         from pystencils.codegen import GpuKernel
 
         kernel = kernel_handle.kernel
-        if not (isinstance(kernel, GpuKernel) and kernel.target == Target.CUDA):
+        if not isinstance(kernel, GpuKernel):
             raise ValueError(
-                "An `SfgCudaKernelInvocation` node can only call a CUDA kernel."
+                "An `SfgGpuKernelInvocation` node can only call GPU kernels."
             )
 
         super().__init__()
         self._kernel_handle = kernel_handle
-        self._num_blocks = num_blocks_code
-        self._threads_per_block = threads_per_block_code
-        self._stream = stream_code
-        self._depends = depends
+        self._grid_size = grid_size
+        self._block_size = block_size
+        self._shared_memory_bytes = shared_memory_bytes
+        self._stream = stream
+
+    @property
+    def children(self) -> Sequence[SfgCallTreeNode]:
+        return (
+            (
+                self._grid_size,
+                self._block_size,
+            )
+            + (
+                (self._shared_memory_bytes,)
+                if self._shared_memory_bytes is not None
+                else ()
+            )
+            + ((self._stream,) if self._stream is not None else ())
+        )
 
     @property
     def depends(self) -> set[SfgVar]:
-        return set(self._kernel_handle.parameters) | self._depends
+        return set(self._kernel_handle.parameters)
 
     def get_code(self, cstyle: CodeStyle) -> str:
-        ast_params = self._kernel_handle.parameters
+        kparams = self._kernel_handle.parameters
         fnc_name = self._kernel_handle.fqname
-        call_parameters = ", ".join([p.name for p in ast_params])
+        call_parameters = ", ".join([p.name for p in kparams])
+
+        grid_args = [self._grid_size, self._block_size]
+        if self._shared_memory_bytes is not None:
+            grid_args += [self._shared_memory_bytes]
 
-        grid_args = [self._num_blocks, self._threads_per_block]
         if self._stream is not None:
             grid_args += [self._stream]
 
-        grid = "<<< " + ", ".join(grid_args) + " >>>"
+        grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>"
         return f"{fnc_name}{grid}({call_parameters});"
 
 
diff --git a/src/pystencilssfg/ir/postprocessing.py b/src/pystencilssfg/ir/postprocessing.py
index 1e692b0aa9f37368da1688ec2d3bac6892c5ac60..896693317c02dee67302221f810d64c01b5eb233 100644
--- a/src/pystencilssfg/ir/postprocessing.py
+++ b/src/pystencilssfg/ir/postprocessing.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 from typing import Sequence, Iterable
 import warnings
-from functools import reduce
 from dataclasses import dataclass
 
 from abc import ABC, abstractmethod
@@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride
 from ..exceptions import SfgException
 from ..config import CodeStyle
 
-from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements
+from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements
 from ..lang.expressions import SfgKernelParamVar
 from ..lang import (
     SfgVar,
@@ -163,17 +162,12 @@ class CallTreePostProcessing:
                 self.handle_sequence(node, ppc)
                 return ppc.live_variables
 
-            case SfgCallTreeLeaf():
-                return node.depends
-
             case SfgDeferredNode():
                 raise SfgException("Deferred nodes can only occur inside a sequence.")
 
             case _:
-                return reduce(
-                    lambda x, y: x | y,
-                    (self.get_live_variables(c) for c in node.children),
-                    set(),
+                return node.depends.union(
+                    *(self.get_live_variables(c) for c in node.children)
                 )
 
 
diff --git a/src/pystencilssfg/lang/cpp/std_tuple.py b/src/pystencilssfg/lang/cpp/std_tuple.py
index 645b6b56fbeb515d6324feafdb8588f7ca22e992..6d1e1c0da0987be055006ce696779a1e2ebaa56b 100644
--- a/src/pystencilssfg/lang/cpp/std_tuple.py
+++ b/src/pystencilssfg/lang/cpp/std_tuple.py
@@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction):
         dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref)
         super().__init__(dtype)
 
+    def get(self, idx: int | str) -> AugExpr:
+        return AugExpr.format("std::get< {} >({})", idx, self)
+
     def _extract_component(self, coordinate: int) -> AugExpr:
         if coordinate < 0 or coordinate >= self._length:
             raise ValueError(
                 f"Index {coordinate} out-of-bounds for std::tuple with {self._length} entries."
             )
 
-        return AugExpr.format("std::get< {} >({})", coordinate, self)
+        return self.get(coordinate)
diff --git a/src/pystencilssfg/lang/expressions.py b/src/pystencilssfg/lang/expressions.py
index 135a54eed92e4ba214244c8f46323ea81f6610db..8be59b0b37d72113814fa1489a44e7816f0e6f75 100644
--- a/src/pystencilssfg/lang/expressions.py
+++ b/src/pystencilssfg/lang/expressions.py
@@ -218,8 +218,12 @@ class AugExpr:
         return self._bind(expr)
 
     @staticmethod
-    def make(code: str, depends: Iterable[SfgVar | AugExpr]):
-        return AugExpr()._bind(DependentExpression(code, depends))
+    def make(
+        code: str,
+        depends: Iterable[SfgVar | AugExpr],
+        dtype: UserTypeSpec | None = None,
+    ):
+        return AugExpr(dtype)._bind(DependentExpression(code, depends))
 
     @staticmethod
     def format(fmt: str, *deps, **kwdeps) -> AugExpr:
diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3b55161eb536cc1dd6dcba2c0bd08827c91dc01
--- /dev/null
+++ b/src/pystencilssfg/lang/gpu.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from typing import Protocol
+
+from .expressions import CppClass, cpptype, AugExpr
+
+
+class Dim3Interface(CppClass):
+    """Interface definition for the ``dim3`` struct of Cuda and HIP."""
+
+    def ctor(self, dim0=1, dim1=1, dim2=1):
+        """Constructor invocation of ``dim3``"""
+        return self.ctor_bind(dim0, dim1, dim2)
+
+    @property
+    def x(self) -> AugExpr:
+        """The `x` coordinate member."""
+        return AugExpr.format("{}.x", self)
+
+    @property
+    def y(self) -> AugExpr:
+        """The `y` coordinate member."""
+        return AugExpr.format("{}.y", self)
+
+    @property
+    def z(self) -> AugExpr:
+        """The `z` coordinate member."""
+        return AugExpr.format("{}.z", self)
+
+    @property
+    def dims(self) -> tuple[AugExpr, AugExpr, AugExpr]:
+        """`x`, `y`, and `z` as a tuple."""
+        return (self.x, self.y, self.z)
+
+
+class ProvidesGpuRuntimeAPI(Protocol):
+    """Protocol definition for a GPU runtime API provider."""
+
+    dim3: type[Dim3Interface]
+    """The ``dim3`` struct type for this GPU runtime"""
+
+    stream_t: type[AugExpr]
+    """The ``stream_t`` type for this GPU runtime"""
+
+
+class CudaAPI(ProvidesGpuRuntimeAPI):
+    """Reflection of the CUDA runtime API"""
+
+    class dim3(Dim3Interface):
+        """Implements `Dim3Interface` for CUDA"""
+
+        template = cpptype("dim3", "<cuda_runtime.h>")
+
+    class stream_t(CppClass):
+        template = cpptype("cudaStream_t", "<cuda_runtime.h>")
+
+
+cuda = CudaAPI
+"""Alias for `CudaAPI`"""
+
+
+class HipAPI(ProvidesGpuRuntimeAPI):
+    """Reflection of the HIP runtime API"""
+
+    class dim3(Dim3Interface):
+        """Implements `Dim3Interface` for HIP"""
+
+        template = cpptype("dim3", "<hip/hip_runtime.h>")
+
+    class stream_t(CppClass):
+        template = cpptype("hipStream_t", "<hip/hip_runtime.h>")
+
+
+hip = HipAPI
+"""Alias for `HipAPI`"""
diff --git a/tests/extensions/test_gpu.py b/tests/extensions/test_gpu.py
deleted file mode 100644
index 2e8d133e80d0b8f9487e39290f4528aa1f11533e..0000000000000000000000000000000000000000
--- a/tests/extensions/test_gpu.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from pystencilssfg.extensions.gpu import dim3class
-from pystencilssfg.lang import HeaderFile, AugExpr
-
-
-def test_dim3():
-    cuda_runtime = "<cuda_runtime.h>"
-    dim3 = dim3class(cuda_runtime, cls_name="dim3")
-    assert HeaderFile.parse(cuda_runtime) in dim3.template.includes
-    assert str(dim3().ctor(128, 1, 1)) == "dim3{128, 1, 1}"
-    assert str(dim3().ctor()) == "dim3{1, 1, 1}"
-    assert str(dim3().ctor(1, 1, 128)) == "dim3{1, 1, 128}"
-
-    block = dim3(ref=True, const=True).var("block")
-
-    dims = [
-        AugExpr.format(
-            "uint32_t(({} + {} - 1)/ {})",
-            1024,
-            block.dims[i],
-            block.dims[i],
-        )
-        for i in range(3)
-    ]
-
-    grid = dim3().ctor(*dims)
-    assert str(grid) == f"dim3{{{', '.join((str(d) for d in dims))}}}"
diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml
index 1c97aaf6a34b0171f4829c56da0f345934c50176..c87977f3761739985b65e64e5b069b0359efdce0 100644
--- a/tests/generator_scripts/index.yaml
+++ b/tests/generator_scripts/index.yaml
@@ -90,6 +90,33 @@ StlContainers1D:
 MdSpanFixedShapeLayouts:
 MdSpanLbStreaming:
 
+# CUDA
+
+CudaKernels:
+  sfg-args:
+    file-extensions: ["hpp", "cu"]
+  compile:
+    cxx: nvcc
+    cxx-flags: 
+      - -std=c++20
+      - -Werror
+      - all-warnings
+      - --expt-relaxed-constexpr
+    skip-if-not-found: true
+
+# HIP
+
+HipKernels:
+  sfg-args:
+    file-extensions: ["hpp", "hip"]
+  compile:
+    cxx: hipcc
+    cxx-flags: 
+      - -std=c++20
+      - -Wall
+      - -Werror
+    skip-if-not-found: true
+
 # SYCL
 
 SyclKernels:
diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7d34d5ab99a7bf2c913b20f4cdba88371f38f7e
--- /dev/null
+++ b/tests/generator_scripts/source/CudaKernels.harness.cpp
@@ -0,0 +1,116 @@
+#include "CudaKernels.hpp"
+
+#include <cuda_runtime.h>
+
+#include <experimental/mdspan>
+#include <random>
+#include <iostream>
+#include <functional>
+
+#undef NDEBUG
+#include <cassert>
+
+namespace stdex = std::experimental;
+
+using extents_t = stdex::dextents<uint64_t, 3>;
+using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>;
+
+void checkCudaError(cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        std::cerr << "HIP Error: " << err << std::endl;
+        exit(2);
+    }
+}
+
+int main(void)
+{
+
+    extents_t extents{23, 25, 132};
+    size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)};
+
+    double *data_src;
+    checkCudaError(cudaMallocManaged<double>(&data_src, sizeof(double) * items));
+    field_t src{data_src, extents};
+
+    double *data_dst;
+    checkCudaError(cudaMallocManaged<double>(&data_dst, sizeof(double) * items));
+    field_t dst{data_dst, extents};
+
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::uniform_real_distribution<double> distrib{-1.0, 1.0};
+
+    auto check = [&](std::function<void()> invoke)
+    {
+        for (size_t i = 0; i < items; ++i)
+        {
+            data_src[i] = distrib(gen);
+            data_dst[i] = NAN;
+        }
+
+        invoke();
+
+        for (size_t i = 0; i < items; ++i)
+        {
+            const double desired = 2.0 * data_src[i];
+            if (std::abs(desired - data_dst[i]) >= 1e-12)
+            {
+                std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+    };
+
+    check([&]()
+          {
+        /* Linear3D Dynamic */
+        dim3 blockSize{64, 8, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d::scaleKernel(blockSize, dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Linear3D Automatic */
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d_automatic::scaleKernel(dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Blockwise4D Automatic */
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d::scaleKernel(dst, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Linear3D Manual */
+        dim3 blockSize{32, 8, 1};
+        dim3 gridSize{5, 4, 23};
+
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Blockwise4D Manual */
+        dim3 blockSize{132, 1, 1};
+        dim3 gridSize{25, 23, 1};
+        cudaStream_t stream;
+        checkCudaError(cudaStreamCreate(&stream));
+        gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkCudaError(cudaStreamSynchronize(stream)); });
+
+    checkCudaError(cudaFree(data_src));
+    checkCudaError(cudaFree(data_dst));
+
+    return EXIT_SUCCESS;
+}
diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..e019e4faf1915023000cc0e12bed770c07d66031
--- /dev/null
+++ b/tests/generator_scripts/source/CudaKernels.py
@@ -0,0 +1,102 @@
+from pystencilssfg import SourceFileGenerator
+from pystencilssfg.lang.cpp import std
+from pystencilssfg.lang.gpu import cuda
+
+import pystencils as ps
+
+std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
+
+
+src, dst = ps.fields("src, dst: double[3D]", layout="c")
+asm = ps.Assignment(dst(0), 2 * src(0))
+
+
+with SourceFileGenerator() as sfg:
+    sfg.namespace("gen")
+
+    base_config = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+    block_size = cuda.dim3().var("blockSize")
+    grid_size = cuda.dim3().var("gridSize")
+    stream = cuda.stream_t().var("stream")
+
+    with sfg.namespace("linear3d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_automatic"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(
+                khandle, block_size=block_size, grid_size=grid_size, stream=stream
+            ),
+        )
+
+    with sfg.namespace("blockwise4d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(
+                khandle, block_size=block_size, grid_size=grid_size, stream=stream
+            ),
+        )
diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bf7b83c1dae274c16b5f5e286e3edd6024280e5
--- /dev/null
+++ b/tests/generator_scripts/source/HipKernels.harness.cpp
@@ -0,0 +1,116 @@
+#include "HipKernels.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <experimental/mdspan>
+#include <random>
+#include <iostream>
+#include <functional>
+
+#undef NDEBUG
+#include <cassert>
+
+namespace stdex = std::experimental;
+
+using extents_t = stdex::dextents<uint64_t, 3>;
+using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>;
+
+void checkHipError(hipError_t err)
+{
+    if (err != hipSuccess)
+    {
+        std::cerr << "HIP Error: " << err << std::endl;
+        exit(2);
+    }
+}
+
+int main(void)
+{
+
+    extents_t extents{23, 25, 132};
+    size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)};
+
+    double *data_src;
+    checkHipError(hipMallocManaged<double>(&data_src, sizeof(double) * items));
+    field_t src{data_src, extents};
+
+    double *data_dst;
+    checkHipError(hipMallocManaged<double>(&data_dst, sizeof(double) * items));
+    field_t dst{data_dst, extents};
+
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::uniform_real_distribution<double> distrib{-1.0, 1.0};
+
+    auto check = [&](std::function<void()> invoke)
+    {
+        for (size_t i = 0; i < items; ++i)
+        {
+            data_src[i] = distrib(gen);
+            data_dst[i] = NAN;
+        }
+
+        invoke();
+
+        for (size_t i = 0; i < items; ++i)
+        {
+            const double desired = 2.0 * data_src[i];
+            if (std::abs(desired - data_dst[i]) >= 1e-12)
+            {
+                std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+    };
+
+    check([&]()
+          {
+        /* Linear3D Dynamic */
+        dim3 blockSize{64, 8, 1};
+        hipStream_t stream;
+        checkHipError(hipStreamCreate(&stream));
+        gen::linear3d::scaleKernel(blockSize, dst, src, stream);
+        checkHipError(hipStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Linear3D Automatic */
+        hipStream_t stream;
+        checkHipError(hipStreamCreate(&stream));
+        gen::linear3d_automatic::scaleKernel(dst, src, stream);
+        checkHipError(hipStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Blockwise4D Automatic */
+        hipStream_t stream;
+        checkHipError(hipStreamCreate(&stream));
+        gen::blockwise4d::scaleKernel(dst, src, stream);
+        checkHipError(hipStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Linear3D Manual */
+        dim3 blockSize{32, 8, 1};
+        dim3 gridSize{5, 4, 23};
+
+        hipStream_t stream;
+        checkHipError(hipStreamCreate(&stream));
+        gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkHipError(hipStreamSynchronize(stream)); });
+
+    check([&]()
+          {
+        /* Blockwise4D Manual */
+        dim3 blockSize{132, 1, 1};
+        dim3 gridSize{25, 23, 1};
+        hipStream_t stream;
+        checkHipError(hipStreamCreate(&stream));
+        gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream);
+        checkHipError(hipStreamSynchronize(stream)); });
+
+    checkHipError(hipFree(data_src));
+    checkHipError(hipFree(data_dst));
+
+    return EXIT_SUCCESS;
+}
diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..20d9df5552ef595330a7d1af8486bd5b47ffee43
--- /dev/null
+++ b/tests/generator_scripts/source/HipKernels.py
@@ -0,0 +1,102 @@
+from pystencilssfg import SourceFileGenerator
+from pystencilssfg.lang.cpp import std
+from pystencilssfg.lang.gpu import hip
+
+import pystencils as ps
+
+std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>")
+
+
+src, dst = ps.fields("src, dst: double[3D]", layout="c")
+asm = ps.Assignment(dst(0), 2 * src(0))
+
+
+with SourceFileGenerator() as sfg:
+    sfg.namespace("gen")
+
+    base_config = ps.CreateKernelConfig(target=ps.Target.HIP)
+
+    block_size = hip.dim3().var("blockSize")
+    grid_size = hip.dim3().var("gridSize")
+    stream = hip.stream_t().var("stream")
+
+    with sfg.namespace("linear3d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, block_size=block_size, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_automatic"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("blockwise4d"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(khandle, stream=stream),
+        )
+
+    with sfg.namespace("linear3d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "linear3d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(
+                khandle, block_size=block_size, grid_size=grid_size, stream=stream
+            ),
+        )
+
+    with sfg.namespace("blockwise4d_manual"):
+        cfg = base_config.copy()
+        cfg.gpu.indexing_scheme = "blockwise4d"
+        cfg.gpu.manual_launch_grid = True
+        khandle = sfg.kernels.create(asm, "scale", cfg)
+
+        sfg.function("scaleKernel")(
+            sfg.map_field(
+                src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right")
+            ),
+            sfg.map_field(
+                dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right")
+            ),
+            sfg.gpu_invoke(
+                khandle, block_size=block_size, grid_size=grid_size, stream=stream
+            ),
+        )
diff --git a/tests/generator_scripts/test_generator_scripts.py b/tests/generator_scripts/test_generator_scripts.py
index 6f2ff160e8b9ec2367020a4a3f91d4a071a71e39..8901c3444370e238d9c6fea60a73141eaffa1596 100644
--- a/tests/generator_scripts/test_generator_scripts.py
+++ b/tests/generator_scripts/test_generator_scripts.py
@@ -12,16 +12,24 @@ import shutil
 import warnings
 import subprocess
 
+from pystencils.include import get_pystencils_include_path
+
 THIS_DIR = pathlib.Path(__file__).parent
 
 DEPS_DIR = THIS_DIR / "deps"
 MDSPAN_QUAL_PATH = "mdspan-mdspan-0.6.0/include/"
 
+PYSTENCILS_RT_INCLUDE_PATH = get_pystencils_include_path()
 
 TEST_INDEX = THIS_DIR / "index.yaml"
 SOURCE_DIR = THIS_DIR / "source"
 EXPECTED_DIR = THIS_DIR / "expected"
-CXX_INCLUDE_FLAGS = ["-I", f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}"]
+CXX_INCLUDE_FLAGS = [
+    "-I",
+    f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}",
+    "-I",
+    PYSTENCILS_RT_INCLUDE_PATH,
+]
 
 
 def prepare_deps():
@@ -101,7 +109,7 @@ class GenScriptTest:
         for ext in self._expected_extensions:
             fname = f"{self._name}.{ext}"
             self._expected_files.add(fname)
-            if ext in ("cpp", "cxx", "c++"):
+            if ext in ("cpp", "cxx", "c++", "cu", "hip"):
                 self._files_to_compile.append(fname)
 
         compile_descr: dict = test_description.get("compile", dict())