diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 474afbb113dbb1a72e6b077e9272a91749c5698d..a2ec00d16cc04af33a2d3e4e46183111e4cfbcda 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -337,6 +337,7 @@ build-documentation:
   artifacts:
     paths:
       - docs/build/html
+    when: always
 
 
 pages:
diff --git a/conftest.py b/conftest.py
index 8296641ed7b54c96fdf0124866f18d5299d66cf9..ff0467eff85d2b43854246ec98091a15ad347e63 100644
--- a/conftest.py
+++ b/conftest.py
@@ -55,7 +55,6 @@ try:
     import cupy
 except ImportError:
     collect_ignore += [
-        os.path.join(SCRIPT_FOLDER, "tests/kernelcreation/test_gpu.py"),
         os.path.join(SCRIPT_FOLDER, "src/pystencils/backend/jit/gpu_cupy.py"),
     ]
     add_path_to_ignore("src/pystencils/gpu")
diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md
index b739a4f33c922c97c083f7f466525892544bc46d..c52f28f51944d7d1c7eb35dc1108906c90058c68 100644
--- a/docs/source/api/codegen.md
+++ b/docs/source/api/codegen.md
@@ -108,6 +108,7 @@ The following categories with target-specific options are exposed:
   VectorizationOptions
   GpuOptions
   SyclOptions
+  GpuIndexingScheme
 
 .. autosummary::
   :toctree: generated
@@ -176,5 +177,5 @@ The following categories with target-specific options are exposed:
   Kernel
   GpuKernel
   Parameter
-  GpuThreadsRange
+  Lambda
 ```
diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
new file mode 100644
index 0000000000000000000000000000000000000000..1082669e640c989cad0b994ac70818f22a3517bb
--- /dev/null
+++ b/docs/source/backend/gpu_codegen.md
@@ -0,0 +1,92 @@
+# GPU Code Generation
+
+The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
+
+ - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration
+   space by mapping GPU block and thread indices to iteration space points. To perform this task,
+   it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping.
+   The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation.
+ - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class.
+   It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration
+   for the runtime system.
+
+:::{attention}
+
+Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment.
+:::
+
+## The CUDA Platform and Thread Mappings
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms.cuda
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    ThreadMapping
+    Linear3DMapping
+    Blockwise4DMapping
+```
+
+## Thread Indexing In The Driver
+
+With regard to GPU thread indexing, the code generation driver has two tasks:
+it must provide the Cuda platform object with a valid thread mapping,
+and must also provide the runtime system with a [launch configuration](#gpu_launch_config)
+which defines the shape of the GPU block grid.
+Both of these are produced by the {any}`GpuIndexing` class.
+It is instantiated with the GPU indexing scheme and indexing options given by the user.
+
+At this time, the backend and code generation driver support two indexing schemes:
+"Linear3D" (see {any}`Linear3DMapping`) and "Blockwise4D" (see {any}`Blockwise4DMapping`).
+These are mostly reimplemented from the pystencils 1.3.x `"block"` and `"line"` indexing options.
+The GPU indexing system may be extended in the future.
+
+
+```{eval-rst}
+.. module:: pystencils.codegen.gpu_indexing
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuIndexing
+```
+
+(gpu_launch_config)=
+## The Launch Configuration
+
+The launch configuration is attached to the `GpuKernel` and thus returned to the runtime system.
+Since a concrete launch configuration is not specific to the kernel itself, but to the kernels'
+invocation site, the code generator only attaches a *factory function* for launch configurations
+to `GpuKernel`. It is up to the runtime system to locally instantiate and configure a launch configuration.
+To determine the actual launch grid, the launch configuration must be evaluated at the kernel's call site
+by passing the required parameters to `GpuLaunchConfiguration.evaluate`
+
+The {any}`CupyJit`, for instance, will create the launch configuration object while preparing the JIT-compiled
+kernel wrapper object. The launch config is there exposed to the user, who may modify some of its properties.
+These depend on the type of the launch configuration:
+while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel
+parameters,
+the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size.
+
+The `evaluate` method can only be used from within a Python runtime environment.
+When exporting pystencils CUDA kernels for external use in C++ projects,
+equivalent C++ code evaluating the launch config must be generated.
+This is the task of, e.g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/).
+
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    GpuLaunchConfiguration
+    AutomaticLaunchConfiguration
+    ManualLaunchConfiguration
+    DynamicBlockSizeLaunchConfiguration
+```
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 5ab8dbd34eb37fbc38230f3db0506c572d4b6964..0d384c55bc1e5933a055365f9d5ffe4143c902b6 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -17,6 +17,7 @@ who wish to customize or extend the behaviour of the code generator in their app
     translation
     platforms
     transformations
+    gpu_codegen
     errors
     extensions
 
diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md
new file mode 100644
index 0000000000000000000000000000000000000000..e7ffc6f1523272621d273a17624d6323c25651b1
--- /dev/null
+++ b/docs/source/backend/platforms.md
@@ -0,0 +1,54 @@
+# Platforms
+
+All target-specific code generation in the pystencils backend is facilitated
+through the *platform classes*.
+This includes:
+
+ - Materialization of the iteration space, meaning the mapping of iteration space points to some indexing structure
+ - Lowering of mathematical functions to their implementation in some runtime environment
+ - Selection of vector intrinsics for SIMD-capable CPU targets
+
+Encapsulation of hardware- and environment-specific details into platform objects allows
+us to implement most of the code generator in a generic and hardware-agnostic way.
+It also makes it easier to extend pystencils with support for additional code generation
+targets in the future.
+
+## Base Classes
+
+```{eval-rst}
+.. module:: pystencils.backend.platforms
+
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    Platform
+    GenericCpu
+    GenericVectorCpu
+    GenericGpu
+```
+
+## CPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    X86VectorCpu
+    X86VectorArch
+```
+
+## GPU Platforms
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: autosummary/entire_class.rst
+
+    CudaPlatform
+    SyclPlatform
+```
diff --git a/docs/source/backend/platforms.rst b/docs/source/backend/platforms.rst
deleted file mode 100644
index 68b74504cfc94dd20e72a5852a2e45d399065aef..0000000000000000000000000000000000000000
--- a/docs/source/backend/platforms.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-*********
-Platforms
-*********
-
-.. automodule:: pystencils.backend.platforms
-    :members:
\ No newline at end of file
diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md
index 56c97509cbc4c0e3f312fade9fd08af90c4c9c3d..8be86cdd436bf4e7079a3a4cae3ac17748ceb9e6 100644
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@@ -1,3 +1,4 @@
+(contribution_guide)=
 # Contribution Guide
 
 Welcome to the Contributor's Guide to pystencils!
diff --git a/docs/source/installation.md b/docs/source/installation.md
index cea0acd2feba1c45f2e73b0d52292a92aaa28d07..deb2b0613564f98468f623544acf3cc1ca9d279e 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -38,12 +38,14 @@ The following feature sets are available:
 If you are developing pystencils, we recommend you perform an editable install of your
 local clone of the repository, with all optional features:
 ```bash
-pip install -e pystencils[alltrafos,interactive,use_cython,doc,tests]
+pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite]
 ```
 
 This includes the additional feature groups `doc`, which contains all dependencies required
 to build this documentation, and `tests`, which adds `flake8` for code style checking,
 `mypy` for static type checking, and `pytest` plus plugins for running the test suite.
+
+For more information on developing pystencils, see the [](#contribution_guide).
 :::
 
 ### For Nvidia GPUs
diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md
index 4db2d79443bc1d09031b3f1a8c8af014a0c824a8..610c61ddf647331d7b77b06968e489b4dcc76293 100644
--- a/docs/source/user_manual/gpu_kernels.md
+++ b/docs/source/user_manual/gpu_kernels.md
@@ -28,7 +28,7 @@ import matplotlib.pyplot as plt
 
 Pystencils offers code generation for Nvidia GPUs using the CUDA programming model,
 as well as just-in-time compilation and execution of CUDA kernels from within Python
-based on the [cupy] library.w
+based on the [cupy] library.
 This section's objective is to give a detailed introduction into the creation of
 GPU kernels with pystencils.
 
@@ -51,16 +51,22 @@ ps.inspect(kernel)
 The `kernel` object returned by the code generator in above snippet is an instance
 of the {py:class}`GpuKernel` class.
 It extends {py:class}`Kernel` with some GPU-specific information.
-In particular, it defines the {any}`threads_range <GpuKernel.threads_range>`
-property, which tells us how many threads the kernel is expecting to be executed with:
-
-```{code-cell} ipython3
-kernel.threads_range
-```
 
 If a GPU is available and [CuPy][cupy] is installed in the current environment,
 the kernel can be compiled and run immediately.
-To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field.
+To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+import cupy as cp
+
+rng = cp.random.default_rng(seed=42)
+f_arr = rng.random((16, 16, 16))
+g_arr = cp.zeros_like(f_arr)
+
+kfunc = kernel.compile()
+kfunc(f=f_arr, g=g_arr)
+```
 
 :::{note}
 [CuPy][cupy] is a Python library for numerical computations on GPU arrays,
@@ -68,54 +74,105 @@ which operates much in the same way that [NumPy][numpy] works on CPU arrays.
 Cupy and NumPy expose nearly the same APIs for array operations;
 the difference being that CuPy allocates all its arrays on the GPU
 and performs its operations as CUDA kernels.
-Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvcc].
+Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc].
 In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code,
 and to allocate and manage the data these kernels can be executed on.
 
 For more information on CuPy, refer to [their documentation][cupy-docs].
 :::
 
-```{code-cell} ipython3
-:tags: [raises-exception]
-import cupy as cp
+(indexing_and_launch_config)=
+## Modify the Indexing Scheme and Launch Configuration
+
+There are two key elements to how the work items of a GPU kernel's iteration space
+are mapped onto a GPU launch grid:
+ - The *indexing scheme* defines the relation between thread indices and iteration space points;
+   it can be modified through the {any}`gpu.indexing_scheme <GpuOptions.indexing_scheme>` option
+   and is fixed for the entire kernel.
+ - The *launch configuration* defines the number of threads per block, and the number of blocks on the grid,
+   with which the kernel should be launched.
+   The launch configuration mostly depends on the size of the arrays passed to the kernel,
+   but parts of it may also be modified.
+   The launch configuration may change at each kernel invocation.
+
+(linear3d)=
+### The Default "Linear3D" Indexing Scheme
+
+By default, *pystencils* will employ a 1:1-mapping between threads and iteration space points
+via the global thread indices inside the launch grid; e.g.
+
+```{code-block} C++
+ctr_0 = start_0 + step_0 * (blockSize.x * blockIdx.x + threadIdx.x);
+ctr_1 = start_1 + step_1 * (blockSize.y * blockIdx.y + threadIdx.y);
+ctr_2 = start_2 + step_2 * (blockSize.z * blockIdx.z + threadIdx.z);
+```
 
-rng = cp.random.default_rng(seed=42)
-f_arr = rng.random((16, 16, 16))
-g_arr = cp.zeros_like(f_arr)
+For most kernels with an at most three-dimensional iteration space,
+this behavior is sufficient and desired.
+It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`.
+
+If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places.
+The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` 
+code generator option;
+if none is specified, a default depending on the iteration space's dimensionality will be used.
 
+The block size can furthermore be modified at the compiled kernel's wrapper object via the
+`launch_config.block_size` attribute:
+
+```{code-cell} ipython3
 kfunc = kernel.compile()
+kfunc.launch_config.block_size = (256, 2, 1)
+
+# Run the kernel
 kfunc(f=f_arr, g=g_arr)
 ```
 
-### Modifying the Launch Grid
+In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments
+and the given thread block size.
 
-The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object.
-This object holds the kernel's launch grid configuration
-(i.e. the number of thread blocks, and the number of threads per block.)
-Pystencils specifies a default value for the block size and if possible, 
-the number of blocks is automatically inferred in order to cover the entire iteration space.
-In addition, the wrapper's interface allows us to customize the GPU launch grid,
-by manually setting both the number of threads per block, and the number of blocks on the grid:
+:::{attention}
 
-```{code-cell} ipython3
-kfunc.block_size = (16, 8, 8)
-kfunc.num_blocks = (1, 2, 2)
-```
+According to the way GPU architecture splits thread blocks into warps,
+pystencils will map the kernel's *fastest* spatial coordinate onto the `x` block and thread
+indices, the second-fastest to `y`, and the slowest coordiante to `z`.
 
-For most kernels, setting only the `block_size` is sufficient since pystencils will
-automatically compute the number of blocks;
-for exceptions to this, see [](#manual_launch_grids).
-If `num_blocks` is set manually and the launch grid thus specified is too small, only
-a part of the iteration space will be traversed by the kernel;
-similarily, if it is too large, it will cause any threads working outside of the iteration bounds to idle.
+This can mean that, when using `cupy` arrays with the default memory layout
+(corresponding to the `"numpy"` field layout specifier),
+the *thread coordinates* and the *spatial coordinates*
+map to each other in *opposite order*; e.g.
+
+| Spatial Coordinate | Thread Index  |
+|--------------------|---------------|
+| `x` (slowest)      | `threadIdx.z` |
+| `y`                | `threadIdx.y` |
+| `z` (fastest)      | `threadIdx.x` |
+
+:::
 
 (manual_launch_grids)=
 ### Manual Launch Grids and Non-Cuboid Iteration Patterns
 
-In some cases, it will be unavoidable to set the launch grid size manually;
-especially if the code generator is unable to automatically determine the size of the
-iteration space.
-An example for this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration).
+By default, the above indexing schemes will automatically compute the GPU launch configuration
+from array shapes and optional user input.
+However, it is also possible to override this behavior and instead specify a launch grid manually.
+This will even be unavoidable if the code generator cannot precompute the number of points
+in the kernel's iteration space.
+
+To specify a manual launch configuration, set the {any}`gpu.manual_launch_grid <GpuOptions.manual_launch_grid>`
+option to `True`.
+Then, after compiling the kernel, set its block and grid size via the `launch_config` property:
+
+```{code-cell} ipython3
+cfg.gpu.manual_launch_grid = True
+
+kernel = ps.create_kernel(update, cfg)
+kfunc = kernel.compile()
+kfunc.launch_config.block_size = (64, 2, 1)
+kfunc.launch_config.grid_size = (4, 2, 1)
+```
+
+An example where this is necessary is the triangular iteration
+previously described in the [Kernel Creation Guide](#example_triangular_iteration).
 Let's set it up once more:
 
 ```{code-cell} ipython3
@@ -150,26 +207,21 @@ assignments = [
 
 ```{code-cell} ipython3
 y = ps.DEFAULTS.spatial_counters[0]
-cfg = ps.CreateKernelConfig(
-    target=ps.Target.CUDA,
-    iteration_slice=ps.make_slice[:, y:]
-)
-    
-kernel = ps.create_kernel(assignments, cfg).compile()
+cfg = ps.CreateKernelConfig()
+cfg.target= ps.Target.CUDA
+cfg.iteration_slice = ps.make_slice[:, y:]
 ```
 
-This warns us that the threads range could not be determined automatically.
-We can disable this warning by setting `manual_launch_grid` in the GPU option category:
+For this kernel, the code generator cannot figure out a launch configuration on its own,
+so we need to manually provide one:
 
-```{code-cell}
+```{code-cell} ipython3
 cfg.gpu.manual_launch_grid = True
-```
-
-Now, to execute our kernel, we have to manually specify its launch grid:
+    
+kernel = ps.create_kernel(assignments, cfg).compile()
 
-```{code-cell} ipython3
-kernel.block_size = (8, 8)
-kernel.num_blocks = (2, 2)
+kernel.launch_config.block_size = (8, 8)
+kernel.launch_config.grid_size = (2, 2)
 ```
 
 This way the kernel will cover this iteration space:
@@ -181,11 +233,11 @@ kernel(f=f_arr)
 _draw_ispace(cp.asnumpy(f_arr))
 ```
 
-We can also observe the effect of decreasing the launch grid size:
+We can also observe the effect of decreasing the launch grid size.
 
 ```{code-cell} ipython3
-kernel.block_size = (4, 4)
-kernel.num_blocks = (2, 3)
+kernel.launch_config.block_size = (4, 4)
+kernel.launch_config.grid_size = (2, 3)
 ```
 
 ```{code-cell} ipython3
@@ -199,15 +251,6 @@ Here, since there are only eight threads operating in $x$-direction,
 and twelve threads in $y$-direction,
 only a part of the triangle is being processed.
 
-## API Reference
-
-```{eval-rst}
-.. autosummary::
-  :nosignatures:
-
-  pystencils.codegen.GpuKernel
-  pystencils.jit.gpu_cupy.CupyKernelWrapper
-```
 
 :::{admonition} Developers To Do:
 
@@ -218,5 +261,5 @@ only a part of the triangle is being processed.
 
 [cupy]: https://cupy.dev "CuPy Homepage"
 [numpy]: https://numpy.org "NumPy Homepage"
-[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver"
+[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library"
 [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation"
diff --git a/pytest.ini b/pytest.ini
index 707a43b4548e99e8e6862e1b48a1844e4318b55e..744a74bc781b3e03568e3c3a67cefbe9395bd713 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -64,6 +64,7 @@ exclude_lines =
        if 0:
        if False:
        if __name__ == .__main__.:
+       assert False
 
        # Don't cover type checking imports
        if TYPE_CHECKING:
diff --git a/src/pystencils/backend/memory.py b/src/pystencils/backend/memory.py
index 7a5d62f691d81a0f251329c47216f65a981ef291..0e9b21d6c8268bec7fef97ac6fcb2b4b0e37f8f4 100644
--- a/src/pystencils/backend/memory.py
+++ b/src/pystencils/backend/memory.py
@@ -89,8 +89,13 @@ class PsSymbol:
         return f"PsSymbol({repr(self._name)}, {repr(self._dtype)})"
 
 
+class BackendPrivateProperty:
+    """Mix-in marker for symbol properties that are private to the backend
+    and should not be exported to parameters"""
+
+
 @dataclass(frozen=True)
-class BufferBasePtr(UniqueSymbolProperty):
+class BufferBasePtr(UniqueSymbolProperty, BackendPrivateProperty):
     """Symbol acts as a base pointer to a buffer."""
 
     buffer: PsBuffer
@@ -120,12 +125,12 @@ class PsBuffer:
         strides: Sequence[PsSymbol | PsConstant],
     ):
         bptr_type = base_ptr.get_dtype()
-        
+
         if not isinstance(bptr_type, PsPointerType):
             raise ValueError(
                 f"Type of buffer base pointer {base_ptr} was not a pointer type: {bptr_type}"
             )
-        
+
         if bptr_type.base_type != element_type:
             raise ValueError(
                 f"Base type of primary buffer base pointer {base_ptr} "
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ff41d3a68888347e81e9ff65d1359467666f2e32..e896fc2bba9a484e33480b34b3e390d8c44eb4df 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
-from warnings import warn
-from typing import TYPE_CHECKING
+from abc import ABC, abstractmethod
 
-from ...types import constify
+from ...types import constify, deconstify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
+from ..memory import PsSymbol
 from ..kernelcreation import (
     Typifier,
     IterationSpace,
@@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
 from ..functions import PsMathFunction, MathFunctions, CFunction
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -48,18 +46,143 @@ GRID_DIM = [
 ]
 
 
+class ThreadMapping(ABC):
+
+    @abstractmethod
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        """Map the current thread index onto a point in the given iteration space.
+
+        Implementations of this method must return a declaration for each dimension counter
+        of the given iteration space.
+        """
+
+
+class Linear3DMapping(ThreadMapping):
+    """3D globally linearized mapping, where each thread is assigned a work item according to
+    its location in the global launch grid."""
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 3:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Linear3D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for coord, dim in enumerate(dimensions[::-1]):
+            tid = self._linear_thread_idx(coord)
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._linear_thread_idx(0)
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
+    def _linear_thread_idx(self, coord: int):
+        block_size = BLOCK_DIM[coord]
+        block_idx = BLOCK_IDX[coord]
+        thread_idx = THREAD_IDX[coord]
+        return block_idx * block_size + thread_idx
+
+
+class Blockwise4DMapping(ThreadMapping):
+    """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions
+    are mapped to block indices."""
+
+    _indices_fastest_first = [  # slowest to fastest
+        THREAD_IDX[0],
+        BLOCK_IDX[0],
+        BLOCK_IDX[1],
+        BLOCK_IDX[2]
+    ]
+
+    def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
+        match ispace:
+            case FullIterationSpace():
+                return self._dense_mapping(ispace)
+            case SparseIterationSpace():
+                return self._sparse_mapping(ispace)
+            case _:
+                assert False, "unexpected iteration space"
+
+    def _dense_mapping(
+        self, ispace: FullIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        if ispace.rank > 4:
+            raise MaterializationError(
+                f"Cannot handle {ispace.rank}-dimensional iteration space "
+                "using the Blockwise4D GPU thread index mapping."
+            )
+
+        dimensions = ispace.dimensions_in_loop_order()
+        idx_map: dict[PsSymbol, PsExpression] = dict()
+
+        for dim, tid in zip(dimensions[::-1], self._indices_fastest_first):
+            idx_map[dim.counter] = dim.start + dim.step * PsCast(
+                deconstify(dim.counter.get_dtype()), tid
+            )
+
+        return idx_map
+
+    def _sparse_mapping(
+        self, ispace: SparseIterationSpace
+    ) -> dict[PsSymbol, PsExpression]:
+        sparse_ctr = PsExpression.make(ispace.sparse_counter)
+        thread_idx = self._indices_fastest_first[0]
+        idx_map: dict[PsSymbol, PsExpression] = {
+            ispace.sparse_counter: PsCast(
+                deconstify(sparse_ctr.get_dtype()), thread_idx
+            )
+        }
+        return idx_map
+
+
 class CudaPlatform(GenericGpu):
-    """Platform for CUDA-based GPUs."""
+    """Platform for CUDA-based GPUs.
+    
+    Args:
+        ctx: The kernel creation context
+        omit_range_check: If `True`, generated index translation code will not check if the point identified
+            by block and thread indices is actually contained in the iteration space
+        thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
+    """
 
     def __init__(
-        self, ctx: KernelCreationContext,
+        self,
+        ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        manual_launch_grid: bool = False,
+        thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
         self._omit_range_check = omit_range_check
-        self._manual_launch_grid = manual_launch_grid
+        self._thread_mapping = (
+            thread_mapping if thread_mapping is not None else Linear3DMapping()
+        )
 
         self._typify = Typifier(ctx)
 
@@ -69,7 +192,7 @@ class CudaPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -141,42 +264,26 @@ class CudaPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
-        dimensions = ispace.dimensions_in_loop_order()
-
-        if not self._manual_launch_grid:
-            try:
-                threads_range = self.threads_from_ispace(ispace)
-            except MaterializationError as e:
-                warn(
-                    str(e.args[0])
-                    + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.",
-                    UserWarning,
-                )
-                threads_range = None
-        else:
-            threads_range = None
+    ) -> PsBlock:
+        ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
         conds = []
-        for i, dim in enumerate(dimensions[::-1]):
+
+        dimensions = ispace.dimensions_in_loop_order()
+
+        for dim in dimensions:
+            # counter declarations must be ordered slowest-to-fastest
+            # such that inner dimensions can depend on outer ones
+
             dim.counter.dtype = constify(dim.counter.get_dtype())
 
-            ctr = PsExpression.make(dim.counter)
+            ctr_expr = PsExpression.make(dim.counter)
             indexing_decls.append(
-                self._typify(
-                    PsDeclaration(
-                        ctr,
-                        dim.start
-                        + dim.step
-                        * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)),
-                    )
-                )
+                self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
             if not self._omit_range_check:
-                conds.append(PsLt(ctr, dim.stop))
-
-        indexing_decls = indexing_decls[::-1]
+                conds.append(PsLt(ctr_expr, dim.stop))
 
         if conds:
             condition: PsExpression = conds[0]
@@ -187,18 +294,19 @@ class CudaPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, threads_range
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
-        sparse_ctr = PsExpression.make(ispace.sparse_counter)
-        thread_idx = self._linear_thread_idx(0)
+        sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+        ctr_mapping = self._thread_mapping(ispace)
+
         sparse_idx_decl = self._typify(
-            PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx))
+            PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter])
         )
 
         mappings = [
@@ -207,7 +315,7 @@ class CudaPlatform(GenericGpu):
                 PsLookup(
                     PsBufferAcc(
                         ispace.index_list.base_pointer,
-                        (sparse_ctr, factory.parse_index(0)),
+                        (sparse_ctr_expr.clone(), factory.parse_index(0)),
                     ),
                     coord.name,
                 ),
@@ -218,16 +326,10 @@ class CudaPlatform(GenericGpu):
 
         if not self._omit_range_check:
             stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr, stop)
+            condition = PsLt(sparse_ctr_expr.clone(), stop)
             ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
         else:
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
-
-    def _linear_thread_idx(self, coord: int):
-        block_size = BLOCK_DIM[coord]
-        block_idx = BLOCK_IDX[coord]
-        thread_idx = THREAD_IDX[coord]
-        return block_idx * block_size + thread_idx
+        return ast
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 15df36cdd9cf416a8438f12816cbe00cfaeea204..b5b35c8b03447f1d5c35ed1289b89542bb1127ca 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,61 +1,7 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
-from abc import abstractmethod
 
-from ..ast.expressions import PsExpression
-from ..ast.structural import PsBlock
-from ..kernelcreation.iteration_space import (
-    IterationSpace,
-    FullIterationSpace,
-    SparseIterationSpace,
-)
 from .platform import Platform
-from ..exceptions import MaterializationError
-
-if TYPE_CHECKING:
-    from ...codegen.kernel import GpuThreadsRange
 
 
 class GenericGpu(Platform):
-    @abstractmethod
-    def materialize_iteration_space(
-        self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange | None]:
-        pass
-
-    @classmethod
-    def threads_from_ispace(cls, ispace: IterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-
-        if isinstance(ispace, FullIterationSpace):
-            return cls._threads_from_full_ispace(ispace)
-        elif isinstance(ispace, SparseIterationSpace):
-            work_items = (PsExpression.make(ispace.index_list.shape[0]),)
-            return GpuThreadsRange(work_items)
-        else:
-            assert False
-
-    @classmethod
-    def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange:
-        from ...codegen.kernel import GpuThreadsRange
-        
-        dimensions = ispace.dimensions_in_loop_order()[::-1]
-        if len(dimensions) > 3:
-            raise NotImplementedError(
-                f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space"
-            )
-        
-        from ..ast.analysis import collect_undefined_symbols as collect
-
-        for dim in dimensions:
-            symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
-            for ctr in ispace.counters:
-                if ctr in symbs:
-                    raise MaterializationError(
-                        "Unable to construct GPU threads range for iteration space: "
-                        f"Limits of dimension counter {dim.counter.name} "
-                        f"depend on another dimension's counter {ctr.name}"
-                    )
-
-        work_items = [ispace.actual_iterations(dim) for dim in dimensions]
-        return GpuThreadsRange(work_items)
+    """Base class for GPU platforms."""
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 2c7ee1c5f4750eac0375bc31a3f44b9eea50642b..8ed4729a2d67777bbd132d9e48140e20e3656767 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,5 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import Any
 
 from ..ast.structural import PsBlock
 from ..ast.expressions import PsCall, PsExpression
@@ -12,9 +11,9 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
 
     The platform performs all target-dependent tasks during code generation:
-
-     - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
-     - Platform-specific optimizations (e.g. vectorization, OpenMP)
+    
+    - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
+    - Platform-specific optimizations (e.g. vectorization, OpenMP)
     """
 
     def __init__(self, ctx: KernelCreationContext) -> None:
@@ -23,12 +22,16 @@ class Platform(ABC):
     @property
     @abstractmethod
     def required_headers(self) -> set[str]:
+        """Set of header files that must be included at the point of definition of a kernel
+        running on this platform."""
         pass
 
     @abstractmethod
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> PsBlock | tuple[PsBlock, Any]:
+    ) -> PsBlock:
+        """Materialize the given iteration space as an indexing structure and embed the given
+        kernel body into that structure."""
         pass
 
     @abstractmethod
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index e1da9e2237b504c4b6b681d48812e4d079a5b463..7f2bbf9f6bf54a8c9cdbcd4a9989e54b9db66923 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from typing import TYPE_CHECKING
 
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
@@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu
 from ..exceptions import MaterializationError
 from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType
 
-if TYPE_CHECKING:
-    from ...codegen import GpuThreadsRange
-
 
 class SyclPlatform(GenericGpu):
 
@@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu):
         self,
         ctx: KernelCreationContext,
         omit_range_check: bool = False,
-        automatic_block_size: bool = False
+        automatic_block_size: bool = False,
     ):
         super().__init__(ctx)
 
@@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu):
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         if isinstance(ispace, FullIterationSpace):
             return self._prepend_dense_translation(body, ispace)
         elif isinstance(ispace, SparseIterationSpace):
@@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu):
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         rank = ispace.rank
         id_type = self._id_type(rank)
         id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type))
         id_decl = self._id_declaration(rank, id_symbol)
 
         dimensions = ispace.dimensions_in_loop_order()
-        launch_config = self.threads_from_ispace(ispace)
-
         indexing_decls = [id_decl]
         conds = []
 
@@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu):
             body.statements = indexing_decls + body.statements
             ast = body
 
-        return ast, launch_config
+        return ast
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
-    ) -> tuple[PsBlock, GpuThreadsRange]:
+    ) -> PsBlock:
         factory = AstFactory(self._ctx)
 
         id_type = PsCustomType("sycl::id< 1 >", const=True)
@@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu):
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
 
-        return ast, self.threads_from_ispace(ispace)
+        return ast
 
     def _item_type(self, rank: int):
         if not self._automatic_block_size:
diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py
index e13f911dd9c2f8dd1a7b264a79dcdcd51cbef003..1b2cd2ffb6256ecd2dbc287e5f4f4c7a3a723dc4 100644
--- a/src/pystencils/codegen/__init__.py
+++ b/src/pystencils/codegen/__init__.py
@@ -4,8 +4,10 @@ from .config import (
     AUTO,
 )
 from .parameters import Parameter
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
+from .kernel import Kernel, GpuKernel
 from .driver import create_kernel, get_driver
+from .functions import Lambda
+from .errors import CodegenError
 
 __all__ = [
     "Target",
@@ -14,7 +16,8 @@ __all__ = [
     "Parameter",
     "Kernel",
     "GpuKernel",
-    "GpuThreadsRange",
+    "Lambda",
     "create_kernel",
     "get_driver",
+    "CodegenError",
 ]
diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py
index bce0757313b597cb9f4413e84652b9878c39a2d5..0d43b40e35b9e90b1f13e2ba7bc279274f466859 100644
--- a/src/pystencils/codegen/config.py
+++ b/src/pystencils/codegen/config.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from warnings import warn
 from abc import ABC
 from collections.abc import Collection
+from enum import Enum, auto
 
 from typing import TYPE_CHECKING, Sequence, Generic, TypeVar, Callable, Any, cast
 from dataclasses import dataclass, InitVar, fields
@@ -85,7 +86,9 @@ class Option(Generic[Option_T, Arg_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Option_T | None:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Option_T | None:
         if obj is None:
             return None
 
@@ -193,7 +196,9 @@ class Category(Generic[Category_T]):
         self._name = name
         self._lookup = f"_{name}"
 
-    def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Category_T:
+    def __get__(
+        self, obj: ConfigBase, objtype: type[ConfigBase] | None = None
+    ) -> Category_T:
         if obj is None:
             return None
 
@@ -331,10 +336,42 @@ class CpuOptions(ConfigBase):
     """
 
 
+class GpuIndexingScheme(Enum):
+    """Available index translation schemes for GPU kernels."""
+
+    Linear3D = auto()
+    """Map coordinates to global thread indices.
+
+    Supports up to three-dimensional iteration spaces.
+    For each dimension (with known start, stop and step values), compute the current iteration
+    point as ``start + step * (blockIdx.c * blockDim.c * threadDim.c)``
+    (where c :math:`\\in` (x, y, z)).
+    """
+
+    Blockwise4D = auto()
+    """On a 3D grid of 1D blocks, map the fastest coordinate onto the intra-block thread index,
+    and slower coordinates onto the block index.
+
+    Supports up to four-dimensional iteration spaces.
+    Using this indexing scheme, the iteration counters of up to four dimensions are assigned
+    like follows, from slowest to fastest:
+
+    .. code-block:: C++
+
+        ctr_3 = blockIdx.z;
+        ctr_2 = blockIdx.y;
+        ctr_1 = blockIdx.x;
+        ctr_0 = threadIDx.x;
+    """
+
+
 @dataclass
 class GpuOptions(ConfigBase):
     """Configuration options specific to GPU targets."""
 
+    indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D)
+    """Thread indexing scheme for dense GPU kernels."""
+
     omit_range_check: BasicOption[bool] = BasicOption(False)
     """If set to `True`, omit the iteration counter range check.
     
@@ -343,8 +380,13 @@ class GpuOptions(ConfigBase):
     This check can be discarded through this option, at your own peril.
     """
 
-    block_size: BasicOption[tuple[int, int, int]] = BasicOption()
-    """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system."""
+    block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO)
+    """Desired block size for the execution of GPU kernels.
+    
+    This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>`
+    is chosen as an indexing scheme.
+    The block size may be overridden at runtime.
+    """
 
     manual_launch_grid: BasicOption[bool] = BasicOption(False)
     """Always require a manually specified launch grid when running this kernel.
@@ -354,6 +396,31 @@ class GpuOptions(ConfigBase):
     The launch grid will then have to be specified manually at runtime.
     """
 
+    @indexing_scheme.validate
+    def _validate_idx_scheme(self, val: str | GpuIndexingScheme):
+        if isinstance(val, GpuIndexingScheme):
+            return val
+
+        match val.lower():
+            case "block":
+                warn(
+                    "GPU indexing scheme name `block` is deprecated and will be removed in pystencils 2.1. "
+                    "Use `Linear3D` instead."
+                )
+                return GpuIndexingScheme.Linear3D
+            case "line":
+                warn(
+                    "GPU indexing scheme name `line` is deprecated and will be removed in pystencils 2.1. "
+                    "Use `Blockwise4D` instead."
+                )
+                return GpuIndexingScheme.Blockwise4D
+            case "linear3d":
+                return GpuIndexingScheme.Linear3D
+            case "blockwise4d":
+                return GpuIndexingScheme.Blockwise4D
+            case _:
+                raise ValueError(f"Invalid GPU indexing scheme: {val}")
+
 
 @dataclass
 class SyclOptions(ConfigBase):
@@ -506,6 +573,9 @@ class CreateKernelConfig(ConfigBase):
     cpu_vectorize_info: InitVar[dict | None] = None
     """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead."""
 
+    gpu_indexing: InitVar[str | None] = None
+    """Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead."""
+
     gpu_indexing_params: InitVar[dict | None] = None
     """Deprecated; set options in the `gpu` category instead."""
 
@@ -531,11 +601,8 @@ class CreateKernelConfig(ConfigBase):
             elif self.get_target() == Target.CUDA:
                 try:
                     from ..jit.gpu_cupy import CupyJit
-
-                    if self.gpu is not None and self.gpu.block_size is not None:
-                        return CupyJit(self.gpu.block_size)
-                    else:
-                        return CupyJit()
+                    
+                    return CupyJit()
 
                 except ImportError:
                     from ..jit import no_jit
@@ -564,6 +631,7 @@ class CreateKernelConfig(ConfigBase):
         data_type: UserTypeSpec | None,
         cpu_openmp: bool | int | None,
         cpu_vectorize_info: dict | None,
+        gpu_indexing: str | None,
         gpu_indexing_params: dict | None,
     ):  # pragma: no cover
         if data_type is not None:
@@ -593,9 +661,7 @@ class CreateKernelConfig(ConfigBase):
                     deprecated_omp.enable = True
                     deprecated_omp.num_threads = cpu_openmp
                 case _:
-                    raise ValueError(
-                        f"Invalid option for `cpu_openmp`: {cpu_openmp}"
-                    )
+                    raise ValueError(f"Invalid option for `cpu_openmp`: {cpu_openmp}")
 
             self.cpu.openmp = deprecated_omp
 
@@ -652,11 +718,20 @@ class CreateKernelConfig(ConfigBase):
 
             self.cpu.vectorize = deprecated_vec_opts
 
+        if gpu_indexing is not None:
+            _deprecated_option("gpu_indexing", "gpu.indexing_scheme")
+            warn(
+                "Setting the deprecated `gpu_indexing` will override the `gpu.indexing_scheme` option",
+                UserWarning,
+            )
+            self.gpu.indexing_scheme = gpu_indexing
+
         if gpu_indexing_params is not None:
-            _deprecated_option("gpu_indexing_params", "gpu_indexing")
+            _deprecated_option("gpu_indexing_params", "gpu")
             warn(
                 "Setting the deprecated `gpu_indexing_params` will override any options "
-                "passed in the `gpu` category."
+                "passed in the `gpu` category.",
+                UserWarning,
             )
 
             self.gpu = GpuOptions(
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 6f44e718d9aaebd393dacb9a99ae88702f07ccaa..14a95c84d899638ea796d13cfddf7dd4e7ccd04f 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import cast, Sequence, Iterable, TYPE_CHECKING
+from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING
 from dataclasses import dataclass, replace
 
 from .target import Target
@@ -10,10 +10,12 @@ from .config import (
     _AUTO_TYPE,
     GhostLayerSpec,
     IterationSliceSpec,
+    GpuIndexingScheme,
 )
-from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .kernel import Kernel, GpuKernel
+from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
+from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration
 
 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -38,7 +40,6 @@ from ..backend.platforms import (
     Platform,
     GenericCpu,
     GenericVectorCpu,
-    GenericGpu,
 )
 from ..backend.exceptions import VectorizationError
 
@@ -145,6 +146,7 @@ class DefaultKernelCreationDriver:
         )
 
         self._target = cfg.get_target()
+        self._gpu_indexing: GpuIndexing | None = self._get_gpu_indexing()
         self._platform = self._get_platform()
 
         self._intermediates: CodegenIntermediates | None
@@ -163,15 +165,9 @@ class DefaultKernelCreationDriver:
     ) -> Kernel:
         kernel_body = self.parse_kernel_body(assignments)
 
-        match self._platform:
-            case GenericCpu():
-                kernel_ast = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
-            case GenericGpu():
-                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
-                    kernel_body, self._ctx.get_iteration_space()
-                )
+        kernel_ast = self._platform.materialize_iteration_space(
+            kernel_body, self._ctx.get_iteration_space()
+        )
 
         if self._intermediates is not None:
             self._intermediates.materialized_ispace = kernel_ast.clone()
@@ -215,14 +211,16 @@ class DefaultKernelCreationDriver:
                 self._cfg.get_jit(),
             )
         else:
+            assert self._gpu_indexing is not None
+
             return create_gpu_kernel_function(
                 self._ctx,
                 self._platform,
                 kernel_ast,
-                gpu_threads,
                 self._cfg.get_option("function_name"),
                 self._target,
                 self._cfg.get_jit(),
+                self._gpu_indexing.get_launch_config_factory(),
             )
 
     def parse_kernel_body(
@@ -395,6 +393,18 @@ class DefaultKernelCreationDriver:
 
         return kernel_ast
 
+    def _get_gpu_indexing(self) -> GpuIndexing | None:
+        if self._target != Target.CUDA:
+            return None
+
+        from .gpu_indexing import dim3
+
+        idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
+        block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size")
+        manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
+
+        return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid)
+
     def _get_platform(self) -> Platform:
         if Target._CPU in self._target:
             if Target._X86 in self._target:
@@ -430,7 +440,9 @@ class DefaultKernelCreationDriver:
                 case Target.SYCL:
                     from ..backend.platforms import SyclPlatform
 
-                    auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size")
+                    auto_block_size: bool = self._cfg.sycl.get_option(
+                        "automatic_block_size"
+                    )
 
                     return SyclPlatform(
                         self._ctx,
@@ -440,12 +452,16 @@ class DefaultKernelCreationDriver:
                 case Target.CUDA:
                     from ..backend.platforms import CudaPlatform
 
-                    manual_grid = gpu_opts.get_option("manual_launch_grid")
+                    thread_mapping = (
+                        self._gpu_indexing.get_thread_mapping()
+                        if self._gpu_indexing is not None
+                        else None
+                    )
 
                     return CudaPlatform(
                         self._ctx,
                         omit_range_check=omit_range_check,
-                        manual_launch_grid=manual_grid,
+                        thread_mapping=thread_mapping,
                     )
 
         raise NotImplementedError(
@@ -475,51 +491,50 @@ def create_gpu_kernel_function(
     ctx: KernelCreationContext,
     platform: Platform,
     body: PsBlock,
-    threads_range: GpuThreadsRange | None,
     function_name: str,
     target_spec: Target,
     jit: JitBase,
+    launch_config_factory: Callable[[], GpuLaunchConfiguration],
 ) -> GpuKernel:
     undef_symbols = collect_undefined_symbols(body)
 
-    if threads_range is not None:
-        for threads in threads_range.num_work_items:
-            undef_symbols |= collect_undefined_symbols(threads)
-
     params = _get_function_params(ctx, undef_symbols)
     req_headers = _get_headers(ctx, platform, body)
 
     kfunc = GpuKernel(
         body,
-        threads_range,
         target_spec,
         function_name,
         params,
         req_headers,
         jit,
+        launch_config_factory,
     )
     kfunc.metadata.update(ctx.metadata)
     return kfunc
 
 
-def _get_function_params(
-    ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
-) -> list[Parameter]:
-    params: list[Parameter] = []
+def _symbol_to_param(ctx: KernelCreationContext, symbol: PsSymbol):
+    from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty
 
-    from pystencils.backend.memory import BufferBasePtr
+    props: set[PsSymbolProperty] = set()
+    for prop in symbol.properties:
+        match prop:
+            case BufferBasePtr(buf):
+                field = ctx.find_field(buf.name)
+                props.add(FieldBasePtr(field))
+            case BackendPrivateProperty():
+                pass
+            case _:
+                props.add(prop)
 
-    for symb in symbols:
-        props: set[PsSymbolProperty] = set()
-        for prop in symb.properties:
-            match prop:
-                case FieldShape() | FieldStride():
-                    props.add(prop)
-                case BufferBasePtr(buf):
-                    field = ctx.find_field(buf.name)
-                    props.add(FieldBasePtr(field))
-        params.append(Parameter(symb.name, symb.get_dtype(), props))
+    return Parameter(symbol.name, symbol.get_dtype(), props)
 
+
+def _get_function_params(
+    ctx: KernelCreationContext, symbols: Iterable[PsSymbol]
+) -> list[Parameter]:
+    params: list[Parameter] = [_symbol_to_param(ctx, s) for s in symbols]
     params.sort(key=lambda p: p.name)
     return params
 
diff --git a/src/pystencils/codegen/errors.py b/src/pystencils/codegen/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..eceb53f611d92a2f3e8c5c4c9105d5fc8e4aa507
--- /dev/null
+++ b/src/pystencils/codegen/errors.py
@@ -0,0 +1,2 @@
+class CodegenError(Exception):
+    """Exception that indicates a fatal error in the code generation driver."""
diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6be3b1f3446c6b9a25a0013f0e06d099edf5bed
--- /dev/null
+++ b/src/pystencils/codegen/functions.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+from typing import Sequence, Any
+
+from .parameters import Parameter
+from ..types import PsType
+
+from ..backend.kernelcreation import KernelCreationContext
+from ..backend.ast.expressions import PsExpression
+
+
+class Lambda:
+    """A one-line function emitted by the code generator as an auxiliary object."""
+
+    @staticmethod
+    def from_expression(ctx: KernelCreationContext, expr: PsExpression):
+        from ..backend.ast.analysis import collect_undefined_symbols
+        from .driver import _get_function_params
+
+        params = _get_function_params(ctx, collect_undefined_symbols(expr))
+        return Lambda(expr, params)
+
+    def __init__(self, expr: PsExpression, params: Sequence[Parameter]):
+        self._expr = expr
+        self._params = tuple(params)
+        self._return_type = expr.get_dtype()
+
+    @property
+    def parameters(self) -> tuple[Parameter, ...]:
+        """Parameters to this lambda"""
+        return self._params
+
+    @property
+    def return_type(self) -> PsType:
+        """Return type of this lambda"""
+        return self._return_type
+
+    def __call__(self, **kwargs) -> Any:
+        """Evaluate this lambda with the given arguments.
+
+        The lambda must receive a value for each parameter listed in `parameters`.
+        """
+        from ..backend.ast.expressions import evaluate_expression
+
+        return evaluate_expression(self._expr, kwargs)
+
+    def __str__(self) -> str:
+        return str(self._expr)
+
+    def c_code(self) -> str:
+        """Print the C code of this lambda"""
+        from ..backend.emission import CAstPrinter
+
+        printer = CAstPrinter()
+        return printer(self._expr)
diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d22ec624856d9cf8a0b825845fee04caaa4ee74
--- /dev/null
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -0,0 +1,356 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import cast, Any, Callable
+from itertools import chain
+
+from .functions import Lambda
+from .parameters import Parameter
+from .errors import CodegenError
+from .config import GpuIndexingScheme, _AUTO_TYPE
+
+from ..backend.kernelcreation import (
+    KernelCreationContext,
+    FullIterationSpace,
+    SparseIterationSpace,
+)
+from ..backend.platforms.cuda import ThreadMapping
+
+from ..backend.ast.expressions import PsExpression
+
+
+dim3 = tuple[int, int, int]
+_Dim3Lambda = tuple[Lambda, Lambda, Lambda]
+
+
+class GpuLaunchConfiguration(ABC):
+    """Base class for launch configurations for CUDA and HIP kernels.
+
+    Args:
+        block_size: A triple of lambdas determining the GPU block size
+        grid_size: A triple of lambdas determining the GPU grid size
+        config_parameters: Set containing all parameters to the given lambdas that are not also
+            parameters to the associated kernel
+    """
+
+    @property
+    @abstractmethod
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+
+    @abstractmethod
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        """Compute block and grid size for a kernel launch.
+
+        Args:
+            kwargs: Valuation providing a value for each parameter listed in `parameters`
+        """
+
+    @abstractmethod
+    def jit_cache_key(self) -> Any:
+        """Return a hashable object that represents any user-configurable options of
+        this launch configuration, such that when the configuration changes, the JIT parameter
+        cache is invalidated."""
+
+
+class AutomaticLaunchConfiguration(GpuLaunchConfiguration):
+    """Launch configuration that is dynamically computed from kernel parameters.
+
+    This launch configuration permits no further user customization.
+    """
+
+    def __init__(
+        self,
+        block_size: _Dim3Lambda,
+        grid_size: _Dim3Lambda,
+    ) -> None:
+        self._block_size = block_size
+        self._grid_size = grid_size
+
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(lb.parameters for lb in chain(block_size, grid_size))
+        )
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        return self._params
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        block_size = tuple(int(bs(**kwargs)) for bs in self._block_size)
+        grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size)
+        return cast(dim3, block_size), cast(dim3, grid_size)
+
+    def jit_cache_key(self) -> Any:
+        return ()
+
+
+class ManualLaunchConfiguration(GpuLaunchConfiguration):
+    """Manual GPU launch configuration.
+
+    This launch configuration requires the user to set block and grid size.
+    """
+
+    def __init__(
+        self,
+    ) -> None:
+        self._block_size: dim3 | None = None
+        self._grid_size: dim3 | None = None
+
+    @property
+    def block_size(self) -> dim3 | None:
+        return self._block_size
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        self._block_size = val
+
+    @property
+    def grid_size(self) -> dim3 | None:
+        return self._grid_size
+
+    @grid_size.setter
+    def grid_size(self, val: dim3):
+        self._grid_size = val
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        return frozenset()
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was set by the user.")
+
+        if self._grid_size is None:
+            raise AttributeError("No GPU grid size was set by the user.")
+
+        return self._block_size, self._grid_size
+
+    def jit_cache_key(self) -> Any:
+        return (self._block_size, self._grid_size)
+
+
+class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
+    """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size.
+
+    The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items
+    in the kernel's iteration space as follows.
+    For each dimension :math:`c \\in \\{ x, y, z \\}`,
+
+    - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``;
+    - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``.
+    """
+
+    def __init__(
+        self,
+        num_work_items: _Dim3Lambda,
+        default_block_size: dim3 | None = None,
+    ) -> None:
+        self._num_work_items = num_work_items
+
+        self._block_size: dim3 | None = default_block_size
+
+        self._params: frozenset[Parameter] = frozenset().union(
+            *(wit.parameters for wit in num_work_items)
+        )
+
+    @property
+    def num_work_items(self) -> _Dim3Lambda:
+        """Lambda expressions that compute the number of work items in each iteration space
+        dimension from kernel parameters."""
+        return self._num_work_items
+
+    @property
+    def block_size(self) -> dim3 | None:
+        """The desired GPU block size."""
+        return self._block_size
+
+    @block_size.setter
+    def block_size(self, val: dim3):
+        self._block_size = val
+
+    @property
+    def parameters(self) -> frozenset[Parameter]:
+        """Parameters of this launch configuration"""
+        return self._params
+
+    def evaluate(self, **kwargs) -> tuple[dim3, dim3]:
+        if self._block_size is None:
+            raise AttributeError("No GPU block size was specified by the user!")
+
+        from ..utils import div_ceil
+
+        num_work_items = cast(
+            dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items)
+        )
+        reduced_block_size = cast(
+            dim3,
+            tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)),
+        )
+        grid_size = cast(
+            dim3,
+            tuple(
+                div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size)
+            ),
+        )
+
+        return reduced_block_size, grid_size
+
+    def jit_cache_key(self) -> Any:
+        return self._block_size
+
+
+class GpuIndexing:
+    """Factory for GPU indexing objects required during code generation.
+
+    This class acts as a helper class for the code generation driver.
+    It produces both the `ThreadMapping` required by the backend,
+    as well as factories for the launch configuration required later by the runtime system.
+
+    Args:
+        ctx: The kernel creation context
+        scheme: The desired GPU indexing scheme
+        block_size: A user-defined default block size, required only if the indexing scheme permits
+            modification of the block size
+        manual_launch_grid: If `True`, always emit a `ManualLaunchConfiguration` to force the runtime system
+            to set the launch configuration explicitly
+    """
+
+    def __init__(
+        self,
+        ctx: KernelCreationContext,
+        scheme: GpuIndexingScheme,
+        default_block_size: dim3 | _AUTO_TYPE | None = None,
+        manual_launch_grid: bool = False,
+    ) -> None:
+        self._ctx = ctx
+        self._scheme = scheme
+        self._default_block_size = default_block_size
+        self._manual_launch_grid = manual_launch_grid
+
+        from ..backend.kernelcreation import AstFactory
+
+        self._factory = AstFactory(self._ctx)
+
+    def get_thread_mapping(self) -> ThreadMapping:
+        """Retrieve a thread mapping object for use by the backend"""
+
+        from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping
+
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return Linear3DMapping()
+            case GpuIndexingScheme.Blockwise4D:
+                return Blockwise4DMapping()
+
+    def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]:
+        """Retrieve a factory for the launch configuration for later consumption by the runtime system"""
+        if self._manual_launch_grid:
+            return ManualLaunchConfiguration
+
+        match self._scheme:
+            case GpuIndexingScheme.Linear3D:
+                return self._get_linear3d_config_factory()
+            case GpuIndexingScheme.Blockwise4D:
+                return self._get_blockwise4d_config_factory()
+
+    def _get_linear3d_config_factory(
+        self,
+    ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]:
+        work_items_expr = self._get_work_items()
+        rank = len(work_items_expr)
+
+        if rank > 3:
+            raise CodegenError(
+                "Cannot create a launch grid configuration using the Linear3D indexing scheme"
+                f" for a {rank}-dimensional kernel."
+            )
+
+        num_work_items = cast(
+            _Dim3Lambda,
+            tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr),
+        )
+
+        def factory():
+            return DynamicBlockSizeLaunchConfiguration(
+                num_work_items,
+                self._get_default_block_size(rank),
+            )
+
+        return factory
+
+    def _get_default_block_size(self, rank: int) -> dim3:
+        if self._default_block_size is None:
+            raise CodegenError("The default block size option was not set")
+
+        if isinstance(self._default_block_size, _AUTO_TYPE):
+            match rank:
+                case 1:
+                    return (256, 1, 1)
+                case 2:
+                    return (128, 2, 1)
+                case 3:
+                    return (128, 2, 2)
+                case _:
+                    assert False, "unreachable code"
+        else:
+            return self._default_block_size
+
+    def _get_blockwise4d_config_factory(
+        self,
+    ) -> Callable[[], AutomaticLaunchConfiguration]:
+        work_items = self._get_work_items()
+        rank = len(work_items)
+
+        if rank > 4:
+            raise ValueError(f"Iteration space rank is too large: {rank}")
+
+        block_size = (
+            Lambda.from_expression(self._ctx, work_items[0]),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1)),
+        )
+
+        grid_size = tuple(
+            Lambda.from_expression(self._ctx, wit) for wit in work_items[1:]
+        ) + tuple(
+            Lambda.from_expression(self._ctx, self._factory.parse_index(1))
+            for _ in range(4 - rank)
+        )
+
+        def factory():
+            return AutomaticLaunchConfiguration(
+                block_size,
+                cast(_Dim3Lambda, grid_size),
+            )
+
+        return factory
+
+    def _get_work_items(self) -> tuple[PsExpression, ...]:
+        """Return a tuple of expressions representing the number of work items
+        in each dimension of the kernel's iteration space,
+        ordered from fastest to slowest dimension.
+        """
+        ispace = self._ctx.get_iteration_space()
+        match ispace:
+            case FullIterationSpace():
+                dimensions = ispace.dimensions_in_loop_order()[::-1]
+
+                from ..backend.ast.analysis import collect_undefined_symbols as collect
+
+                for i, dim in enumerate(dimensions):
+                    symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step)
+                    for ctr in ispace.counters:
+                        if ctr in symbs:
+                            raise CodegenError(
+                                "Unable to construct GPU launch grid constraints for this kernel: "
+                                f"Limits in dimension {i} "
+                                f"depend on another dimension's counter {ctr.name}"
+                            )
+
+                return tuple(ispace.actual_iterations(dim) for dim in dimensions)
+
+            case SparseIterationSpace():
+                return (self._factory.parse_index(ispace.index_list.shape[0]),)
+
+            case _:
+                assert False, "unexpected iteration space"
diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py
index 3adc47876dc36af02ee307dde25ad5d7250cd3fb..181e6ad3b5d1cb1f1835d8b4e656c39f65a1316b 100644
--- a/src/pystencils/codegen/kernel.py
+++ b/src/pystencils/codegen/kernel.py
@@ -6,8 +6,9 @@ from itertools import chain
 
 from .target import Target
 from .parameters import Parameter
+from .gpu_indexing import GpuLaunchConfiguration
+
 from ..backend.ast.structural import PsBlock
-from ..backend.ast.expressions import PsExpression
 from ..field import Field
 
 from .._deprecation import _deprecated
@@ -118,54 +119,16 @@ class GpuKernel(Kernel):
     def __init__(
         self,
         body: PsBlock,
-        threads_range: GpuThreadsRange | None,
         target: Target,
         name: str,
         parameters: Sequence[Parameter],
         required_headers: set[str],
         jit: JitBase,
+        launch_config_factory: Callable[[], GpuLaunchConfiguration],
     ):
         super().__init__(body, target, name, parameters, required_headers, jit)
-        self._threads_range = threads_range
+        self._launch_config_factory = launch_config_factory
 
-    @property
-    def threads_range(self) -> GpuThreadsRange | None:
+    def get_launch_configuration(self) -> GpuLaunchConfiguration:
         """Object exposing the total size of the launch grid this kernel expects to be executed with."""
-        return self._threads_range
-
-
-class GpuThreadsRange:
-    """Number of threads required by a GPU kernel, in order (x, y, z)."""
-
-    def __init__(
-        self,
-        num_work_items: Sequence[PsExpression],
-    ):
-        self._dim = len(num_work_items)
-        self._num_work_items = tuple(num_work_items)
-
-    # @property
-    # def grid_size(self) -> tuple[PsExpression, ...]:
-    #     return self._grid_size
-
-    # @property
-    # def block_size(self) -> tuple[PsExpression, ...]:
-    #     return self._block_size
-
-    @property
-    def num_work_items(self) -> tuple[PsExpression, ...]:
-        """Number of work items in (x, y, z)-order."""
-        return self._num_work_items
-
-    @property
-    def dim(self) -> int:
-        return self._dim
-
-    def __str__(self) -> str:
-        rep = "GpuThreadsRange { "
-        rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items))
-        rep += " }"
-        return rep
-
-    def _repr_html_(self) -> str:
-        return str(self)
+        return self._launch_config_factory()
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index a407bb75e08bfde9911070aef03b4a1769a6221a..1c771a427ab5025c0f041d88f1acbec7da9f0920 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -1,4 +1,4 @@
-from typing import Any, Sequence, cast
+from typing import Any, Sequence, cast, Callable
 from dataclasses import dataclass
 
 try:
@@ -18,6 +18,7 @@ from ..codegen import (
     GpuKernel,
     Parameter,
 )
+from ..codegen.gpu_indexing import GpuLaunchConfiguration
 from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
 from ..types import PsStructType, PsPointerType
 
@@ -35,12 +36,10 @@ class CupyKernelWrapper(KernelWrapper):
         self,
         kfunc: GpuKernel,
         raw_kernel: Any,
-        block_size: tuple[int, int, int],
     ):
         self._kfunc: GpuKernel = kfunc
+        self._launch_config = kfunc.get_launch_configuration()
         self._raw_kernel = raw_kernel
-        self._block_size = block_size
-        self._num_blocks: tuple[int, int, int] | None = None
         self._args_cache: dict[Any, tuple] = dict()
 
     @property
@@ -48,24 +47,12 @@ class CupyKernelWrapper(KernelWrapper):
         return self._kfunc
 
     @property
-    def raw_kernel(self):
-        return self._raw_kernel
-
-    @property
-    def block_size(self) -> tuple[int, int, int]:
-        return self._block_size
-
-    @block_size.setter
-    def block_size(self, bs: tuple[int, int, int]):
-        self._block_size = bs
+    def launch_config(self) -> GpuLaunchConfiguration:
+        return self._launch_config
 
     @property
-    def num_blocks(self) -> tuple[int, int, int] | None:
-        return self._num_blocks
-
-    @num_blocks.setter
-    def num_blocks(self, nb: tuple[int, int, int] | None):
-        self._num_blocks = nb
+    def raw_kernel(self):
+        return self._raw_kernel
 
     def __call__(self, **kwargs: Any):
         kernel_args, launch_grid = self._get_cached_args(**kwargs)
@@ -80,7 +67,9 @@ class CupyKernelWrapper(KernelWrapper):
         return devices.pop()
 
     def _get_cached_args(self, **kwargs):
-        key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items())
+        key = (self._launch_config.jit_cache_key(),) + tuple(
+            (k, id(v)) for k, v in kwargs.items()
+        )
 
         if key not in self._args_cache:
             args = self._get_args(**kwargs)
@@ -90,16 +79,19 @@ class CupyKernelWrapper(KernelWrapper):
             return self._args_cache[key]
 
     def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]:
-        args = []
+        kernel_args = []
         valuation: dict[str, Any] = dict()
 
-        def add_arg(name: str, arg: Any, dtype: PsType):
-            nptype = dtype.numpy_dtype
+        def add_arg(param: Parameter, arg: Any):
+            nptype = param.dtype.numpy_dtype
             assert nptype is not None
             typecast = nptype.type
             arg = typecast(arg)
-            args.append(arg)
-            valuation[name] = arg
+            valuation[param.name] = arg
+
+        def add_kernel_arg(param: Parameter, arg: Any):
+            add_arg(param, arg)
+            kernel_args.append(arg)
 
         field_shapes = set()
         index_shapes = set()
@@ -152,21 +144,23 @@ class CupyKernelWrapper(KernelWrapper):
                         )
 
         #   Collect parameter values
-        arr: cp.ndarray
 
-        for kparam in self._kfunc.parameters:
-            if kparam.is_field_parameter:
+        def process_param(param: Parameter, adder: Callable[[Parameter, Any], None]):
+            arr: cp.ndarray
+
+            if param.is_field_parameter:
                 #   Determine field-associated data to pass in
-                for prop in kparam.properties:
+                for prop in param.properties:
                     match prop:
                         case FieldBasePtr(field):
 
                             elem_dtype: PsType
 
                             from .. import DynamicType
+
                             if isinstance(field.dtype, DynamicType):
-                                assert isinstance(kparam.dtype, PsPointerType)
-                                elem_dtype = kparam.dtype.base_type
+                                assert isinstance(param.dtype, PsPointerType)
+                                elem_dtype = param.dtype.base_type
                             else:
                                 elem_dtype = field.dtype
 
@@ -176,65 +170,39 @@ class CupyKernelWrapper(KernelWrapper):
                                     f"Data type mismatch at array argument {field.name}:"
                                     f"Expected {field.dtype}, got {arr.dtype}"
                                 )
-                            check_shape(kparam, arr)
-                            args.append(arr)
+                            check_shape(param, arr)
+                            kernel_args.append(arr)
                             break
 
                         case FieldShape(field, coord):
                             arr = kwargs[field.name]
-                            add_arg(kparam.name, arr.shape[coord], kparam.dtype)
+                            adder(param, arr.shape[coord])
                             break
 
                         case FieldStride(field, coord):
                             arr = kwargs[field.name]
-                            add_arg(
-                                kparam.name,
+                            adder(
+                                param,
                                 arr.strides[coord] // arr.dtype.itemsize,
-                                kparam.dtype,
                             )
                             break
             else:
                 #   scalar parameter
-                val: Any = kwargs[kparam.name]
-                add_arg(kparam.name, val, kparam.dtype)
-
-        #   Determine launch grid
-        from ..backend.ast.expressions import evaluate_expression
-
-        symbolic_threads_range = self._kfunc.threads_range
-
-        if self._num_blocks is not None:
-            launch_grid = LaunchGrid(self._num_blocks, self._block_size)
-
-        elif symbolic_threads_range is not None:
-            threads_range: list[int] = [
-                evaluate_expression(expr, valuation)
-                for expr in symbolic_threads_range.num_work_items
-            ]
-
-            if symbolic_threads_range.dim < 3:
-                threads_range += [1] * (3 - symbolic_threads_range.dim)
+                val: Any = kwargs[param.name]
+                adder(param, val)
 
-            def div_ceil(a, b):
-                return a // b if a % b == 0 else a // b + 1
+        #   Process Arguments
 
-            #   TODO: Refine this?
-            num_blocks = tuple(
-                div_ceil(threads, tpb)
-                for threads, tpb in zip(threads_range, self._block_size)
-            )
-            assert len(num_blocks) == 3
+        for kparam in self._kfunc.parameters:
+            process_param(kparam, add_kernel_arg)
 
-            launch_grid = LaunchGrid(num_blocks, self._block_size)
+        for cparam in self._launch_config.parameters:
+            if cparam.name not in valuation:
+                process_param(cparam, add_arg)
 
-        else:
-            raise JitError(
-                "Unable to determine launch grid for GPU kernel invocation: "
-                "No manual grid size was specified, and the number of threads could not "
-                "be determined automatically."
-            )
+        block_size, grid_size = self._launch_config.evaluate(**valuation)
 
-        return tuple(args), launch_grid
+        return tuple(kernel_args), LaunchGrid(grid_size, block_size)
 
 
 class CupyJit(JitBase):
@@ -252,26 +220,26 @@ class CupyJit(JitBase):
             tuple(default_block_size) + (1,) * (3 - len(default_block_size)),
         )
 
-    def compile(self, kfunc: Kernel) -> KernelWrapper:
+    def compile(self, kernel: Kernel) -> KernelWrapper:
         if not HAVE_CUPY:
             raise JitError(
                 "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable."
             )
 
-        if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA:
+        if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA:
             raise ValueError(
                 "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP"
             )
 
         options = self._compiler_options()
-        prelude = self._prelude(kfunc)
-        kernel_code = self._kernel_code(kfunc)
+        prelude = self._prelude(kernel)
+        kernel_code = self._kernel_code(kernel)
         code = prelude + kernel_code
 
         raw_kernel = cp.RawKernel(
-            code, kfunc.name, options=options, backend="nvrtc", jitify=True
+            code, kernel.name, options=options, backend="nvrtc", jitify=True
         )
-        return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size)
+        return CupyKernelWrapper(kernel, raw_kernel)
 
     def _compiler_options(self) -> tuple[str, ...]:
         options = ["-w", "-std=c++11"]
diff --git a/src/pystencils/sympyextensions/integer_functions.py b/src/pystencils/sympyextensions/integer_functions.py
index 42513ef9c587bd8f14228083afa4fff419220fa4..9d2c69502f99130699262dd9b63362a3841fccfc 100644
--- a/src/pystencils/sympyextensions/integer_functions.py
+++ b/src/pystencils/sympyextensions/integer_functions.py
@@ -140,10 +140,10 @@ class div_ceil(IntegerFunctionTwoArgsMixIn):
 
     @classmethod
     def eval(cls, arg1, arg2):
-        from ..utils import c_intdiv
+        from ..utils import div_ceil
 
         if is_integer_sequence((arg1, arg2)):
-            return c_intdiv(arg1 + arg2 - 1, arg2)
+            return div_ceil(arg1, arg2)
 
     def _eval_op(self, arg1, arg2):
         return self.eval(arg1, arg2)
diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py
index a53eb82896ab635c3995be918d31a03326766d5d..0049d0a2ce36471c3cfb98bf39c965d33c24868b 100644
--- a/src/pystencils/utils.py
+++ b/src/pystencils/utils.py
@@ -4,11 +4,13 @@ from itertools import groupby
 from collections import Counter
 from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
-from typing import Mapping
+from typing import Mapping, overload
 
 import numpy as np
 import sympy as sp
 
+from numpy.typing import NDArray
+
 
 class DotDict(dict):
     """Normal dict with additional dot access for all keys"""
@@ -254,6 +256,24 @@ class ContextVar:
         return self.stack[-1]
 
 
+@overload
+def c_intdiv(num: int, denom: int) -> int: ...
+
+
+@overload
+def c_intdiv(
+    num: NDArray[np.integer], denom: NDArray[np.integer]
+) -> NDArray[np.integer]: ...
+
+
+@overload
+def c_intdiv(num: int, denom: NDArray[np.integer]) -> NDArray[np.integer]: ...
+
+
+@overload
+def c_intdiv(num: NDArray[np.integer], denom: int) -> NDArray[np.integer]: ...
+
+
 def c_intdiv(num, denom):
     """C-style integer division"""
     if isinstance(num, np.ndarray) or isinstance(denom, np.ndarray):
@@ -271,3 +291,28 @@ def c_rem(num, denom):
     """C-style integer remainder"""
     div = c_intdiv(num, denom)
     return num - div * denom
+
+
+@overload
+def div_ceil(divident: int, divisor: int) -> int: ...
+
+
+@overload
+def div_ceil(
+    divident: NDArray[np.integer], divisor: NDArray[np.integer]
+) -> NDArray[np.integer]: ...
+
+
+@overload
+def div_ceil(divident: int, divisor: NDArray[np.integer]) -> NDArray[np.integer]: ...
+
+
+@overload
+def div_ceil(divident: NDArray[np.integer], divisor: int) -> NDArray[np.integer]: ...
+
+
+def div_ceil(divident, divisor):
+    """For nonnegative integer arguments, compute ``ceil(num / denom)``.
+    The result is unspecified if either argument is negative."""
+
+    return c_intdiv(divident + divisor - 1, divisor)
diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py
index da261faec49940df31d59f44651956e2012b113a..0d71dbe1a250c865c0f637aa3a125837abfe39e7 100644
--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -32,14 +32,7 @@ def inspect_dp_kernel(kernel: Kernel, gen_config: CreateKernelConfig):
             assert "_mm512_storeu_pd" in code
 
 
-def test_filter_kernel(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
@@ -62,14 +55,7 @@ def test_filter_kernel(gen_config):
     xp.testing.assert_allclose(dst_arr, expected)
 
 
-def test_filter_kernel_fixedsize(gen_config):
-    if gen_config.target == Target.CUDA:
-        import cupy as cp
-
-        xp = cp
-    else:
-        xp = np
-
+def test_filter_kernel_fixedsize(gen_config, xp):
     weight = sp.Symbol("weight")
     stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
 
diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py
index 97f0c0fa9ee847389aee71bc511abef55c9f2b93..10b37e610cebd23c9fc961f14118aee5f24582c4 100644
--- a/tests/kernelcreation/test_gpu.py
+++ b/tests/kernelcreation/test_gpu.py
@@ -4,16 +4,87 @@ import numpy as np
 import sympy as sp
 from scipy.ndimage import convolve
 
-from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target
-# from pystencils.gpu import BlockIndexing
-from pystencils.simp import sympy_cse_on_assignment_list
-from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice
+from pystencils import (
+    Assignment,
+    Field,
+    fields,
+    CreateKernelConfig,
+    create_kernel,
+    Target,
+)
+
+from pystencils.slicing import (
+    add_ghost_layers,
+    make_slice,
+    remove_ghost_layers,
+    normalize_slice,
+)
 
 try:
     import cupy as cp
+
     device_numbers = range(cp.cuda.runtime.getDeviceCount())
 except ImportError:
-    device_numbers = []
+    pytest.skip(reason="CuPy is not available", allow_module_level=True)
+
+
+@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"])
+@pytest.mark.parametrize("omit_range_check", [False, True])
+@pytest.mark.parametrize("manual_grid", [False, True])
+def test_indexing_options(
+    indexing_scheme: str, omit_range_check: bool, manual_grid: bool
+):
+    src, dst = fields("src, dst: [3D]")
+    asm = Assignment(
+        dst.center(),
+        src[-1, 0, 0]
+        + src[1, 0, 0]
+        + src[0, -1, 0]
+        + src[0, 1, 0]
+        + src[0, 0, -1]
+        + src[0, 0, 1],
+    )
+
+    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg.gpu.indexing_scheme = indexing_scheme
+    cfg.gpu.omit_range_check = omit_range_check
+    cfg.gpu.manual_launch_grid = manual_grid
+
+    ast = create_kernel(asm, cfg)
+    kernel = ast.compile()
+
+    src_arr = cp.ones((18, 34, 42))
+    dst_arr = cp.zeros_like(src_arr)
+
+    if manual_grid:
+        match indexing_scheme:
+            case "linear3d":
+                kernel.launch_config.block_size = (10, 8, 8)
+                kernel.launch_config.grid_size = (4, 4, 2)
+            case "blockwise4d":
+                kernel.launch_config.block_size = (40, 1, 1)
+                kernel.launch_config.grid_size = (32, 16, 1)
+
+    elif indexing_scheme == "linear3d":
+        kernel.launch_config.block_size = (10, 8, 8)
+
+    kernel(src=src_arr, dst=dst_arr)
+
+    expected = cp.zeros_like(src_arr)
+    expected[1:-1, 1:-1, 1:-1].fill(6.0)
+
+    cp.testing.assert_allclose(dst_arr, expected)
+
+
+def test_invalid_indexing_schemes():
+    src, dst = fields("src, dst: [4D]")
+    asm = Assignment(src.center(0), dst.center(0))
+
+    cfg = CreateKernelConfig(target=Target.CUDA)
+    cfg.gpu.indexing_scheme = "linear3d"
+
+    with pytest.raises(Exception):
+        create_kernel(asm, cfg)
 
 
 def test_averaging_kernel():
@@ -21,14 +92,16 @@ def test_averaging_kernel():
     src_arr = np.random.rand(*size)
     src_arr = add_ghost_layers(src_arr)
     dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr)
+    src_field = Field.create_from_numpy_array("src", src_arr)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr)
 
-    update_rule = Assignment(dst_field[0, 0],
-                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4,
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -37,20 +110,24 @@ def test_averaging_kernel():
     dst_arr = gpu_dst_arr.get()
 
     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
-    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
+    reference = convolve(
+        remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0
+    )
     reference = add_ghost_layers(reference)
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
 def test_variable_sized_fields():
-    src_field = Field.create_generic('src', spatial_dimensions=2)
-    dst_field = Field.create_generic('dst', spatial_dimensions=2)
+    src_field = Field.create_generic("src", spatial_dimensions=2)
+    dst_field = Field.create_generic("dst", spatial_dimensions=2)
 
-    update_rule = Assignment(dst_field[0, 0],
-                             (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4)
+    update_rule = Assignment(
+        dst_field[0, 0],
+        (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4,
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     size = (3, 3)
@@ -64,7 +141,9 @@ def test_variable_sized_fields():
     dst_arr = gpu_dst_arr.get()
 
     stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0
-    reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0)
+    reference = convolve(
+        remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0
+    )
     reference = add_ghost_layers(reference)
     np.testing.assert_almost_equal(reference, dst_arr)
 
@@ -76,12 +155,14 @@ def test_multiple_index_dimensions():
     src_arr = np.array(np.random.rand(*src_size))
     dst_arr = np.zeros(dst_size)
 
-    src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=1)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0)
+    src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=1)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0)
 
     offset = (-2, -1)
-    update_rule = Assignment(dst_field[0, 0],
-                             sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])]))
+    update_rule = Assignment(
+        dst_field[0, 0],
+        sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])]),
+    )
 
     config = CreateKernelConfig(target=Target.GPU)
     ast = create_kernel([update_rule], config=config)
@@ -94,26 +175,31 @@ def test_multiple_index_dimensions():
 
     reference = np.zeros_like(dst_arr)
     gl = np.max(np.abs(np.array(offset, dtype=int)))
-    for x in range(gl, src_size[0]-gl):
-        for y in range(gl, src_size[1]-gl):
-            reference[x, y] = sum([src_arr[x+offset[0], y+offset[1], i] for i in range(src_size[2])])
+    for x in range(gl, src_size[0] - gl):
+        for y in range(gl, src_size[1] - gl):
+            reference[x, y] = sum(
+                [src_arr[x + offset[0], y + offset[1], i] for i in range(src_size[2])]
+            )
 
     np.testing.assert_almost_equal(reference, dst_arr)
 
 
-@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_ghost_layer():
     size = (6, 5)
     src_arr = np.ones(size)
     dst_arr = np.zeros_like(src_arr)
-    src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=0)
-    dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0)
+    src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=0)
+    dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0)
 
     update_rule = Assignment(dst_field[0, 0], src_field[0, 0])
     ghost_layers = [(1, 2), (2, 1)]
 
-    config = CreateKernelConfig(target=Target.GPU, ghost_layers=ghost_layers, gpu="line")
-    ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config)
+    config = CreateKernelConfig()
+    config.target = Target.CUDA
+    config.ghost_layers = ghost_layers
+    config.gpu.indexing_scheme = "blockwise4d"
+
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     gpu_src_arr = cp.asarray(src_arr)
@@ -122,11 +208,13 @@ def test_ghost_layer():
     dst_arr = gpu_dst_arr.get()
 
     reference = np.zeros_like(src_arr)
-    reference[ghost_layers[0][0]:-ghost_layers[0][1], ghost_layers[1][0]:-ghost_layers[1][1]] = 1
+    reference[
+        ghost_layers[0][0] : -ghost_layers[0][1],
+        ghost_layers[1][0] : -ghost_layers[1][1],
+    ] = 1
     np.testing.assert_equal(reference, dst_arr)
 
 
-@pytest.mark.xfail(reason="Line indexing not available yet")
 def test_setting_value():
     arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5)
     arr_gpu = cp.asarray(arr_cpu)
@@ -135,8 +223,12 @@ def test_setting_value():
     f = Field.create_generic("f", 2)
     update_rule = [Assignment(f(0), sp.Symbol("value"))]
 
-    config = CreateKernelConfig(target=Target.GPU, gpu="line", iteration_slice=iteration_slice)
-    ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config)
+    config = CreateKernelConfig()
+    config.target = Target.CUDA
+    config.iteration_slice = iteration_slice
+    config.gpu.indexing_scheme = "blockwise4d"
+
+    ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
     kernel(f=arr_gpu, value=np.float64(42.0))
@@ -167,21 +259,30 @@ def test_periodicity():
 def test_block_indexing(device_number):
     f = fields("f: [3D]")
     s = normalize_slice(make_slice[:, :, :], f.spatial_shape)
-    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
-                       permute_block_size_dependent_on_layout=False)
-    assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32)
-    assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8)
-
-    bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1),
-                       permute_block_size_dependent_on_layout=False)
-    assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2)
-
-    bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2),
-                       maximum_block_size="auto", device_number=device_number)
+    bi = BlockIndexing(
+        s, f.layout, block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False
+    )
+    assert bi.call_parameters((3, 2, 32))["block"] == (3, 2, 32)
+    assert bi.call_parameters((32, 2, 32))["block"] == (16, 2, 8)
+
+    bi = BlockIndexing(
+        s, f.layout, block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False
+    )
+    assert bi.call_parameters((1, 16, 16))["block"] == (1, 16, 2)
+
+    bi = BlockIndexing(
+        s,
+        f.layout,
+        block_size=(16, 8, 2),
+        maximum_block_size="auto",
+        device_number=device_number,
+    )
 
     # This function should be used if number of needed registers is known. Can be determined with func.num_regs
     registers_per_thread = 1000
-    blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread)
+    blocks = bi.limit_block_size_by_register_restriction(
+        [1024, 1024, 1], registers_per_thread
+    )
 
     if cp.cuda.runtime.is_hip:
         max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number)
@@ -193,9 +294,9 @@ def test_block_indexing(device_number):
     assert np.prod(blocks) * registers_per_thread < max_registers_per_block
 
 
-@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
-@pytest.mark.parametrize('layout', ("C", "F"))
-@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
+@pytest.mark.parametrize("gpu_indexing", ("block", "line"))
+@pytest.mark.parametrize("layout", ("C", "F"))
+@pytest.mark.parametrize("shape", ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11)))
 @pytest.mark.xfail(reason="4D kernels not available yet")
 def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     n_elements = np.prod(shape)
@@ -207,7 +308,9 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape):
     f = Field.create_from_numpy_array("f", arr_cpu)
     update_rule = [Assignment(f.center, sp.Symbol("value"))]
 
-    config = CreateKernelConfig(target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice)
+    config = CreateKernelConfig(
+        target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice
+    )
     ast = create_kernel(update_rule, config=config)
     kernel = ast.compile()
 
diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py
index 02b6b99220d11748647f82ccd88679cec6832ae7..b1f2da576750b60e4fbb8d9d4d33e393bc00dcf3 100644
--- a/tests/kernelcreation/test_iteration_slices.py
+++ b/tests/kernelcreation/test_iteration_slices.py
@@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem
 from pystencils.simp import sympy_cse_on_assignment_list
 from pystencils.slicing import normalize_slice
 from pystencils.jit.gpu_cupy import CupyKernelWrapper
+from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration
 
 
 def test_sliced_iteration():
@@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
         expected[r, r:] = 1.0
 
     update = Assignment(f.center(), 1)
-    outer_counter = DEFAULTS.spatial_counters[0]
-    islice = make_slice[:, outer_counter:]
+
+    #   Have NumPy data layout -> X is slowest coordinate, Y is fastest
+    slow_counter = DEFAULTS.spatial_counters[0]
+    islice = make_slice[:, slow_counter:]
     gen_config = replace(gen_config, iteration_slice=islice)
 
     if gen_config.target == Target.CUDA:
@@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp):
     kernel = create_kernel(update, gen_config).compile()
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = shape + (1,)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = shape + (1,)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
@@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp):
             pytest.xfail("Gather/Scatter not implemented yet")
 
     if isinstance(kernel, CupyKernelWrapper):
-        kernel.block_size = (8, 16, 1)
-        kernel.num_blocks = (1, 1, 1)
+        assert isinstance(kernel.launch_config, ManualLaunchConfiguration)
+
+        kernel.launch_config.block_size = (8, 16, 1)
+        kernel.launch_config.grid_size = (1, 1, 1)
 
     kernel(f=f_arr)
 
diff --git a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py b/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
deleted file mode 100644
index da2b3a5ad3a0e224bc47a5dd0fa4f16b0ccde520..0000000000000000000000000000000000000000
--- a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import pytest
-
-from pystencils.field import Field
-
-from pystencils.backend.kernelcreation import (
-    KernelCreationContext,
-    FullIterationSpace
-)
-
-from pystencils.backend.ast.structural import PsBlock, PsComment
-
-from pystencils.backend.platforms import CudaPlatform, SyclPlatform
-
-
-@pytest.mark.parametrize("layout", ["fzyx", "zyxf", "c", "f"])
-@pytest.mark.parametrize("platform_class", [CudaPlatform, SyclPlatform])
-def test_thread_range(platform_class, layout):
-    ctx = KernelCreationContext()
-
-    body = PsBlock([PsComment("Kernel body goes here")])
-    platform = platform_class(ctx)
-
-    dim = 3
-    archetype_field = Field.create_generic("field", spatial_dimensions=dim, layout=layout)
-    ispace = FullIterationSpace.create_with_ghost_layers(ctx, 1, archetype_field)
-
-    _, threads_range = platform.materialize_iteration_space(body, ispace)
-
-    assert threads_range.dim == dim
-    
-    match layout:
-        case "fzyx" | "zyxf" | "f":
-            indexing_order = [0, 1, 2]
-        case "c":
-            indexing_order = [2, 1, 0]
-
-    for i in range(dim):
-        #   Slowest to fastest coordinate
-        coordinate = indexing_order[i]
-        dimension = ispace.dimensions[coordinate]
-        witems = threads_range.num_work_items[i]
-        desired = dimension.stop - dimension.start
-        assert witems.structurally_equal(desired)