diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 474afbb113dbb1a72e6b077e9272a91749c5698d..a2ec00d16cc04af33a2d3e4e46183111e4cfbcda 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -337,6 +337,7 @@ build-documentation: artifacts: paths: - docs/build/html + when: always pages: diff --git a/conftest.py b/conftest.py index 8296641ed7b54c96fdf0124866f18d5299d66cf9..ff0467eff85d2b43854246ec98091a15ad347e63 100644 --- a/conftest.py +++ b/conftest.py @@ -55,7 +55,6 @@ try: import cupy except ImportError: collect_ignore += [ - os.path.join(SCRIPT_FOLDER, "tests/kernelcreation/test_gpu.py"), os.path.join(SCRIPT_FOLDER, "src/pystencils/backend/jit/gpu_cupy.py"), ] add_path_to_ignore("src/pystencils/gpu") diff --git a/docs/source/api/codegen.md b/docs/source/api/codegen.md index b739a4f33c922c97c083f7f466525892544bc46d..c52f28f51944d7d1c7eb35dc1108906c90058c68 100644 --- a/docs/source/api/codegen.md +++ b/docs/source/api/codegen.md @@ -108,6 +108,7 @@ The following categories with target-specific options are exposed: VectorizationOptions GpuOptions SyclOptions + GpuIndexingScheme .. autosummary:: :toctree: generated @@ -176,5 +177,5 @@ The following categories with target-specific options are exposed: Kernel GpuKernel Parameter - GpuThreadsRange + Lambda ``` diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md new file mode 100644 index 0000000000000000000000000000000000000000..1082669e640c989cad0b994ac70818f22a3517bb --- /dev/null +++ b/docs/source/backend/gpu_codegen.md @@ -0,0 +1,92 @@ +# GPU Code Generation + +The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components: + + - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration + space by mapping GPU block and thread indices to iteration space points. To perform this task, + it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping. + The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation. + - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class. + It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration + for the runtime system. + +:::{attention} + +Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment. +::: + +## The CUDA Platform and Thread Mappings + +```{eval-rst} +.. module:: pystencils.backend.platforms.cuda + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + ThreadMapping + Linear3DMapping + Blockwise4DMapping +``` + +## Thread Indexing In The Driver + +With regard to GPU thread indexing, the code generation driver has two tasks: +it must provide the Cuda platform object with a valid thread mapping, +and must also provide the runtime system with a [launch configuration](#gpu_launch_config) +which defines the shape of the GPU block grid. +Both of these are produced by the {any}`GpuIndexing` class. +It is instantiated with the GPU indexing scheme and indexing options given by the user. + +At this time, the backend and code generation driver support two indexing schemes: +"Linear3D" (see {any}`Linear3DMapping`) and "Blockwise4D" (see {any}`Blockwise4DMapping`). +These are mostly reimplemented from the pystencils 1.3.x `"block"` and `"line"` indexing options. +The GPU indexing system may be extended in the future. + + +```{eval-rst} +.. module:: pystencils.codegen.gpu_indexing + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + GpuIndexing +``` + +(gpu_launch_config)= +## The Launch Configuration + +The launch configuration is attached to the `GpuKernel` and thus returned to the runtime system. +Since a concrete launch configuration is not specific to the kernel itself, but to the kernels' +invocation site, the code generator only attaches a *factory function* for launch configurations +to `GpuKernel`. It is up to the runtime system to locally instantiate and configure a launch configuration. +To determine the actual launch grid, the launch configuration must be evaluated at the kernel's call site +by passing the required parameters to `GpuLaunchConfiguration.evaluate` + +The {any}`CupyJit`, for instance, will create the launch configuration object while preparing the JIT-compiled +kernel wrapper object. The launch config is there exposed to the user, who may modify some of its properties. +These depend on the type of the launch configuration: +while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel +parameters, +the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. + +The `evaluate` method can only be used from within a Python runtime environment. +When exporting pystencils CUDA kernels for external use in C++ projects, +equivalent C++ code evaluating the launch config must be generated. +This is the task of, e.g., [pystencils-sfg](https://pycodegen.pages.i10git.cs.fau.de/pystencils-sfg/). + + +```{eval-rst} +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + GpuLaunchConfiguration + AutomaticLaunchConfiguration + ManualLaunchConfiguration + DynamicBlockSizeLaunchConfiguration +``` diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst index 5ab8dbd34eb37fbc38230f3db0506c572d4b6964..0d384c55bc1e5933a055365f9d5ffe4143c902b6 100644 --- a/docs/source/backend/index.rst +++ b/docs/source/backend/index.rst @@ -17,6 +17,7 @@ who wish to customize or extend the behaviour of the code generator in their app translation platforms transformations + gpu_codegen errors extensions diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md new file mode 100644 index 0000000000000000000000000000000000000000..e7ffc6f1523272621d273a17624d6323c25651b1 --- /dev/null +++ b/docs/source/backend/platforms.md @@ -0,0 +1,54 @@ +# Platforms + +All target-specific code generation in the pystencils backend is facilitated +through the *platform classes*. +This includes: + + - Materialization of the iteration space, meaning the mapping of iteration space points to some indexing structure + - Lowering of mathematical functions to their implementation in some runtime environment + - Selection of vector intrinsics for SIMD-capable CPU targets + +Encapsulation of hardware- and environment-specific details into platform objects allows +us to implement most of the code generator in a generic and hardware-agnostic way. +It also makes it easier to extend pystencils with support for additional code generation +targets in the future. + +## Base Classes + +```{eval-rst} +.. module:: pystencils.backend.platforms + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + Platform + GenericCpu + GenericVectorCpu + GenericGpu +``` + +## CPU Platforms + +```{eval-rst} +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + X86VectorCpu + X86VectorArch +``` + +## GPU Platforms + +```{eval-rst} +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + CudaPlatform + SyclPlatform +``` diff --git a/docs/source/backend/platforms.rst b/docs/source/backend/platforms.rst deleted file mode 100644 index 68b74504cfc94dd20e72a5852a2e45d399065aef..0000000000000000000000000000000000000000 --- a/docs/source/backend/platforms.rst +++ /dev/null @@ -1,6 +0,0 @@ -********* -Platforms -********* - -.. automodule:: pystencils.backend.platforms - :members: \ No newline at end of file diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md index 56c97509cbc4c0e3f312fade9fd08af90c4c9c3d..8be86cdd436bf4e7079a3a4cae3ac17748ceb9e6 100644 --- a/docs/source/contributing/index.md +++ b/docs/source/contributing/index.md @@ -1,3 +1,4 @@ +(contribution_guide)= # Contribution Guide Welcome to the Contributor's Guide to pystencils! diff --git a/docs/source/installation.md b/docs/source/installation.md index cea0acd2feba1c45f2e73b0d52292a92aaa28d07..deb2b0613564f98468f623544acf3cc1ca9d279e 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -38,12 +38,14 @@ The following feature sets are available: If you are developing pystencils, we recommend you perform an editable install of your local clone of the repository, with all optional features: ```bash -pip install -e pystencils[alltrafos,interactive,use_cython,doc,tests] +pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite] ``` This includes the additional feature groups `doc`, which contains all dependencies required to build this documentation, and `tests`, which adds `flake8` for code style checking, `mypy` for static type checking, and `pytest` plus plugins for running the test suite. + +For more information on developing pystencils, see the [](#contribution_guide). ::: ### For Nvidia GPUs diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 4db2d79443bc1d09031b3f1a8c8af014a0c824a8..610c61ddf647331d7b77b06968e489b4dcc76293 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -28,7 +28,7 @@ import matplotlib.pyplot as plt Pystencils offers code generation for Nvidia GPUs using the CUDA programming model, as well as just-in-time compilation and execution of CUDA kernels from within Python -based on the [cupy] library.w +based on the [cupy] library. This section's objective is to give a detailed introduction into the creation of GPU kernels with pystencils. @@ -51,16 +51,22 @@ ps.inspect(kernel) The `kernel` object returned by the code generator in above snippet is an instance of the {py:class}`GpuKernel` class. It extends {py:class}`Kernel` with some GPU-specific information. -In particular, it defines the {any}`threads_range <GpuKernel.threads_range>` -property, which tells us how many threads the kernel is expecting to be executed with: - -```{code-cell} ipython3 -kernel.threads_range -``` If a GPU is available and [CuPy][cupy] is installed in the current environment, the kernel can be compiled and run immediately. -To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field. +To execute the kernel, a {any}`cupy.ndarray` has to be passed for each field: + +```{code-cell} ipython3 +:tags: [raises-exception] +import cupy as cp + +rng = cp.random.default_rng(seed=42) +f_arr = rng.random((16, 16, 16)) +g_arr = cp.zeros_like(f_arr) + +kfunc = kernel.compile() +kfunc(f=f_arr, g=g_arr) +``` :::{note} [CuPy][cupy] is a Python library for numerical computations on GPU arrays, @@ -68,54 +74,105 @@ which operates much in the same way that [NumPy][numpy] works on CPU arrays. Cupy and NumPy expose nearly the same APIs for array operations; the difference being that CuPy allocates all its arrays on the GPU and performs its operations as CUDA kernels. -Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvcc]. +Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc]. In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code, and to allocate and manage the data these kernels can be executed on. For more information on CuPy, refer to [their documentation][cupy-docs]. ::: -```{code-cell} ipython3 -:tags: [raises-exception] -import cupy as cp +(indexing_and_launch_config)= +## Modify the Indexing Scheme and Launch Configuration + +There are two key elements to how the work items of a GPU kernel's iteration space +are mapped onto a GPU launch grid: + - The *indexing scheme* defines the relation between thread indices and iteration space points; + it can be modified through the {any}`gpu.indexing_scheme <GpuOptions.indexing_scheme>` option + and is fixed for the entire kernel. + - The *launch configuration* defines the number of threads per block, and the number of blocks on the grid, + with which the kernel should be launched. + The launch configuration mostly depends on the size of the arrays passed to the kernel, + but parts of it may also be modified. + The launch configuration may change at each kernel invocation. + +(linear3d)= +### The Default "Linear3D" Indexing Scheme + +By default, *pystencils* will employ a 1:1-mapping between threads and iteration space points +via the global thread indices inside the launch grid; e.g. + +```{code-block} C++ +ctr_0 = start_0 + step_0 * (blockSize.x * blockIdx.x + threadIdx.x); +ctr_1 = start_1 + step_1 * (blockSize.y * blockIdx.y + threadIdx.y); +ctr_2 = start_2 + step_2 * (blockSize.z * blockIdx.z + threadIdx.z); +``` -rng = cp.random.default_rng(seed=42) -f_arr = rng.random((16, 16, 16)) -g_arr = cp.zeros_like(f_arr) +For most kernels with an at most three-dimensional iteration space, +this behavior is sufficient and desired. +It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`. + +If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places. +The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` +code generator option; +if none is specified, a default depending on the iteration space's dimensionality will be used. +The block size can furthermore be modified at the compiled kernel's wrapper object via the +`launch_config.block_size` attribute: + +```{code-cell} ipython3 kfunc = kernel.compile() +kfunc.launch_config.block_size = (256, 2, 1) + +# Run the kernel kfunc(f=f_arr, g=g_arr) ``` -### Modifying the Launch Grid +In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments +and the given thread block size. -The `kernel.compile()` invocation in the above code produces a {any}`CupyKernelWrapper` callable object. -This object holds the kernel's launch grid configuration -(i.e. the number of thread blocks, and the number of threads per block.) -Pystencils specifies a default value for the block size and if possible, -the number of blocks is automatically inferred in order to cover the entire iteration space. -In addition, the wrapper's interface allows us to customize the GPU launch grid, -by manually setting both the number of threads per block, and the number of blocks on the grid: +:::{attention} -```{code-cell} ipython3 -kfunc.block_size = (16, 8, 8) -kfunc.num_blocks = (1, 2, 2) -``` +According to the way GPU architecture splits thread blocks into warps, +pystencils will map the kernel's *fastest* spatial coordinate onto the `x` block and thread +indices, the second-fastest to `y`, and the slowest coordiante to `z`. -For most kernels, setting only the `block_size` is sufficient since pystencils will -automatically compute the number of blocks; -for exceptions to this, see [](#manual_launch_grids). -If `num_blocks` is set manually and the launch grid thus specified is too small, only -a part of the iteration space will be traversed by the kernel; -similarily, if it is too large, it will cause any threads working outside of the iteration bounds to idle. +This can mean that, when using `cupy` arrays with the default memory layout +(corresponding to the `"numpy"` field layout specifier), +the *thread coordinates* and the *spatial coordinates* +map to each other in *opposite order*; e.g. + +| Spatial Coordinate | Thread Index | +|--------------------|---------------| +| `x` (slowest) | `threadIdx.z` | +| `y` | `threadIdx.y` | +| `z` (fastest) | `threadIdx.x` | + +::: (manual_launch_grids)= ### Manual Launch Grids and Non-Cuboid Iteration Patterns -In some cases, it will be unavoidable to set the launch grid size manually; -especially if the code generator is unable to automatically determine the size of the -iteration space. -An example for this is the triangular iteration previously described in the [Kernel Creation Guide](#example_triangular_iteration). +By default, the above indexing schemes will automatically compute the GPU launch configuration +from array shapes and optional user input. +However, it is also possible to override this behavior and instead specify a launch grid manually. +This will even be unavoidable if the code generator cannot precompute the number of points +in the kernel's iteration space. + +To specify a manual launch configuration, set the {any}`gpu.manual_launch_grid <GpuOptions.manual_launch_grid>` +option to `True`. +Then, after compiling the kernel, set its block and grid size via the `launch_config` property: + +```{code-cell} ipython3 +cfg.gpu.manual_launch_grid = True + +kernel = ps.create_kernel(update, cfg) +kfunc = kernel.compile() +kfunc.launch_config.block_size = (64, 2, 1) +kfunc.launch_config.grid_size = (4, 2, 1) +``` + +An example where this is necessary is the triangular iteration +previously described in the [Kernel Creation Guide](#example_triangular_iteration). Let's set it up once more: ```{code-cell} ipython3 @@ -150,26 +207,21 @@ assignments = [ ```{code-cell} ipython3 y = ps.DEFAULTS.spatial_counters[0] -cfg = ps.CreateKernelConfig( - target=ps.Target.CUDA, - iteration_slice=ps.make_slice[:, y:] -) - -kernel = ps.create_kernel(assignments, cfg).compile() +cfg = ps.CreateKernelConfig() +cfg.target= ps.Target.CUDA +cfg.iteration_slice = ps.make_slice[:, y:] ``` -This warns us that the threads range could not be determined automatically. -We can disable this warning by setting `manual_launch_grid` in the GPU option category: +For this kernel, the code generator cannot figure out a launch configuration on its own, +so we need to manually provide one: -```{code-cell} +```{code-cell} ipython3 cfg.gpu.manual_launch_grid = True -``` - -Now, to execute our kernel, we have to manually specify its launch grid: + +kernel = ps.create_kernel(assignments, cfg).compile() -```{code-cell} ipython3 -kernel.block_size = (8, 8) -kernel.num_blocks = (2, 2) +kernel.launch_config.block_size = (8, 8) +kernel.launch_config.grid_size = (2, 2) ``` This way the kernel will cover this iteration space: @@ -181,11 +233,11 @@ kernel(f=f_arr) _draw_ispace(cp.asnumpy(f_arr)) ``` -We can also observe the effect of decreasing the launch grid size: +We can also observe the effect of decreasing the launch grid size. ```{code-cell} ipython3 -kernel.block_size = (4, 4) -kernel.num_blocks = (2, 3) +kernel.launch_config.block_size = (4, 4) +kernel.launch_config.grid_size = (2, 3) ``` ```{code-cell} ipython3 @@ -199,15 +251,6 @@ Here, since there are only eight threads operating in $x$-direction, and twelve threads in $y$-direction, only a part of the triangle is being processed. -## API Reference - -```{eval-rst} -.. autosummary:: - :nosignatures: - - pystencils.codegen.GpuKernel - pystencils.jit.gpu_cupy.CupyKernelWrapper -``` :::{admonition} Developers To Do: @@ -218,5 +261,5 @@ only a part of the triangle is being processed. [cupy]: https://cupy.dev "CuPy Homepage" [numpy]: https://numpy.org "NumPy Homepage" -[nvcc]: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html "NVIDIA CUDA Compiler Driver" +[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library" [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation" diff --git a/pytest.ini b/pytest.ini index 707a43b4548e99e8e6862e1b48a1844e4318b55e..744a74bc781b3e03568e3c3a67cefbe9395bd713 100644 --- a/pytest.ini +++ b/pytest.ini @@ -64,6 +64,7 @@ exclude_lines = if 0: if False: if __name__ == .__main__.: + assert False # Don't cover type checking imports if TYPE_CHECKING: diff --git a/src/pystencils/backend/memory.py b/src/pystencils/backend/memory.py index 7a5d62f691d81a0f251329c47216f65a981ef291..0e9b21d6c8268bec7fef97ac6fcb2b4b0e37f8f4 100644 --- a/src/pystencils/backend/memory.py +++ b/src/pystencils/backend/memory.py @@ -89,8 +89,13 @@ class PsSymbol: return f"PsSymbol({repr(self._name)}, {repr(self._dtype)})" +class BackendPrivateProperty: + """Mix-in marker for symbol properties that are private to the backend + and should not be exported to parameters""" + + @dataclass(frozen=True) -class BufferBasePtr(UniqueSymbolProperty): +class BufferBasePtr(UniqueSymbolProperty, BackendPrivateProperty): """Symbol acts as a base pointer to a buffer.""" buffer: PsBuffer @@ -120,12 +125,12 @@ class PsBuffer: strides: Sequence[PsSymbol | PsConstant], ): bptr_type = base_ptr.get_dtype() - + if not isinstance(bptr_type, PsPointerType): raise ValueError( f"Type of buffer base pointer {base_ptr} was not a pointer type: {bptr_type}" ) - + if bptr_type.base_type != element_type: raise ValueError( f"Base type of primary buffer base pointer {base_ptr} " diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index ff41d3a68888347e81e9ff65d1359467666f2e32..e896fc2bba9a484e33480b34b3e390d8c44eb4df 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,11 @@ from __future__ import annotations -from warnings import warn -from typing import TYPE_CHECKING +from abc import ABC, abstractmethod -from ...types import constify +from ...types import constify, deconstify from ..exceptions import MaterializationError from .generic_gpu import GenericGpu +from ..memory import PsSymbol from ..kernelcreation import ( Typifier, IterationSpace, @@ -29,8 +29,6 @@ from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral from ..functions import PsMathFunction, MathFunctions, CFunction -if TYPE_CHECKING: - from ...codegen import GpuThreadsRange int32 = PsSignedIntegerType(width=32, const=False) @@ -48,18 +46,143 @@ GRID_DIM = [ ] +class ThreadMapping(ABC): + + @abstractmethod + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + """Map the current thread index onto a point in the given iteration space. + + Implementations of this method must return a declaration for each dimension counter + of the given iteration space. + """ + + +class Linear3DMapping(ThreadMapping): + """3D globally linearized mapping, where each thread is assigned a work item according to + its location in the global launch grid.""" + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 3: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Linear3D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for coord, dim in enumerate(dimensions[::-1]): + tid = self._linear_thread_idx(coord) + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._linear_thread_idx(0) + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + + def _linear_thread_idx(self, coord: int): + block_size = BLOCK_DIM[coord] + block_idx = BLOCK_IDX[coord] + thread_idx = THREAD_IDX[coord] + return block_idx * block_size + thread_idx + + +class Blockwise4DMapping(ThreadMapping): + """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions + are mapped to block indices.""" + + _indices_fastest_first = [ # slowest to fastest + THREAD_IDX[0], + BLOCK_IDX[0], + BLOCK_IDX[1], + BLOCK_IDX[2] + ] + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 4: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Blockwise4D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for dim, tid in zip(dimensions[::-1], self._indices_fastest_first): + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._indices_fastest_first[0] + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + + class CudaPlatform(GenericGpu): - """Platform for CUDA-based GPUs.""" + """Platform for CUDA-based GPUs. + + Args: + ctx: The kernel creation context + omit_range_check: If `True`, generated index translation code will not check if the point identified + by block and thread indices is actually contained in the iteration space + thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points + """ def __init__( - self, ctx: KernelCreationContext, + self, + ctx: KernelCreationContext, omit_range_check: bool = False, - manual_launch_grid: bool = False, + thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) self._omit_range_check = omit_range_check - self._manual_launch_grid = manual_launch_grid + self._thread_mapping = ( + thread_mapping if thread_mapping is not None else Linear3DMapping() + ) self._typify = Typifier(ctx) @@ -69,7 +192,7 @@ class CudaPlatform(GenericGpu): def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange | None]: + ) -> PsBlock: if isinstance(ispace, FullIterationSpace): return self._prepend_dense_translation(body, ispace) elif isinstance(ispace, SparseIterationSpace): @@ -141,42 +264,26 @@ class CudaPlatform(GenericGpu): def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange | None]: - dimensions = ispace.dimensions_in_loop_order() - - if not self._manual_launch_grid: - try: - threads_range = self.threads_from_ispace(ispace) - except MaterializationError as e: - warn( - str(e.args[0]) - + "\nIf this is intended, set `manual_launch_grid=True` in the code generator configuration.", - UserWarning, - ) - threads_range = None - else: - threads_range = None + ) -> PsBlock: + ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] conds = [] - for i, dim in enumerate(dimensions[::-1]): + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + # counter declarations must be ordered slowest-to-fastest + # such that inner dimensions can depend on outer ones + dim.counter.dtype = constify(dim.counter.get_dtype()) - ctr = PsExpression.make(dim.counter) + ctr_expr = PsExpression.make(dim.counter) indexing_decls.append( - self._typify( - PsDeclaration( - ctr, - dim.start - + dim.step - * PsCast(ctr.get_dtype(), self._linear_thread_idx(i)), - ) - ) + self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) if not self._omit_range_check: - conds.append(PsLt(ctr, dim.stop)) - - indexing_decls = indexing_decls[::-1] + conds.append(PsLt(ctr_expr, dim.stop)) if conds: condition: PsExpression = conds[0] @@ -187,18 +294,19 @@ class CudaPlatform(GenericGpu): body.statements = indexing_decls + body.statements ast = body - return ast, threads_range + return ast def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> PsBlock: factory = AstFactory(self._ctx) ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) - sparse_ctr = PsExpression.make(ispace.sparse_counter) - thread_idx = self._linear_thread_idx(0) + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + ctr_mapping = self._thread_mapping(ispace) + sparse_idx_decl = self._typify( - PsDeclaration(sparse_ctr, PsCast(sparse_ctr.get_dtype(), thread_idx)) + PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter]) ) mappings = [ @@ -207,7 +315,7 @@ class CudaPlatform(GenericGpu): PsLookup( PsBufferAcc( ispace.index_list.base_pointer, - (sparse_ctr, factory.parse_index(0)), + (sparse_ctr_expr.clone(), factory.parse_index(0)), ), coord.name, ), @@ -218,16 +326,10 @@ class CudaPlatform(GenericGpu): if not self._omit_range_check: stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr, stop) + condition = PsLt(sparse_ctr_expr.clone(), stop) ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) else: body.statements = [sparse_idx_decl] + body.statements ast = body - return ast, self.threads_from_ispace(ispace) - - def _linear_thread_idx(self, coord: int): - block_size = BLOCK_DIM[coord] - block_idx = BLOCK_IDX[coord] - thread_idx = THREAD_IDX[coord] - return block_idx * block_size + thread_idx + return ast diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 15df36cdd9cf416a8438f12816cbe00cfaeea204..b5b35c8b03447f1d5c35ed1289b89542bb1127ca 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,61 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING -from abc import abstractmethod -from ..ast.expressions import PsExpression -from ..ast.structural import PsBlock -from ..kernelcreation.iteration_space import ( - IterationSpace, - FullIterationSpace, - SparseIterationSpace, -) from .platform import Platform -from ..exceptions import MaterializationError - -if TYPE_CHECKING: - from ...codegen.kernel import GpuThreadsRange class GenericGpu(Platform): - @abstractmethod - def materialize_iteration_space( - self, body: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange | None]: - pass - - @classmethod - def threads_from_ispace(cls, ispace: IterationSpace) -> GpuThreadsRange: - from ...codegen.kernel import GpuThreadsRange - - if isinstance(ispace, FullIterationSpace): - return cls._threads_from_full_ispace(ispace) - elif isinstance(ispace, SparseIterationSpace): - work_items = (PsExpression.make(ispace.index_list.shape[0]),) - return GpuThreadsRange(work_items) - else: - assert False - - @classmethod - def _threads_from_full_ispace(cls, ispace: FullIterationSpace) -> GpuThreadsRange: - from ...codegen.kernel import GpuThreadsRange - - dimensions = ispace.dimensions_in_loop_order()[::-1] - if len(dimensions) > 3: - raise NotImplementedError( - f"Cannot create a GPU threads range for an {len(dimensions)}-dimensional iteration space" - ) - - from ..ast.analysis import collect_undefined_symbols as collect - - for dim in dimensions: - symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step) - for ctr in ispace.counters: - if ctr in symbs: - raise MaterializationError( - "Unable to construct GPU threads range for iteration space: " - f"Limits of dimension counter {dim.counter.name} " - f"depend on another dimension's counter {ctr.name}" - ) - - work_items = [ispace.actual_iterations(dim) for dim in dimensions] - return GpuThreadsRange(work_items) + """Base class for GPU platforms.""" diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 2c7ee1c5f4750eac0375bc31a3f44b9eea50642b..8ed4729a2d67777bbd132d9e48140e20e3656767 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from typing import Any from ..ast.structural import PsBlock from ..ast.expressions import PsCall, PsExpression @@ -12,9 +11,9 @@ class Platform(ABC): """Abstract base class for all supported platforms. The platform performs all target-dependent tasks during code generation: - - - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) - - Platform-specific optimizations (e.g. vectorization, OpenMP) + + - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) + - Platform-specific optimizations (e.g. vectorization, OpenMP) """ def __init__(self, ctx: KernelCreationContext) -> None: @@ -23,12 +22,16 @@ class Platform(ABC): @property @abstractmethod def required_headers(self) -> set[str]: + """Set of header files that must be included at the point of definition of a kernel + running on this platform.""" pass @abstractmethod def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> PsBlock | tuple[PsBlock, Any]: + ) -> PsBlock: + """Materialize the given iteration space as an indexing structure and embed the given + kernel body into that structure.""" pass @abstractmethod diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index e1da9e2237b504c4b6b681d48812e4d079a5b463..7f2bbf9f6bf54a8c9cdbcd4a9989e54b9db66923 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,5 +1,4 @@ from __future__ import annotations -from typing import TYPE_CHECKING from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( @@ -29,9 +28,6 @@ from .generic_gpu import GenericGpu from ..exceptions import MaterializationError from ...types import PsCustomType, PsIeeeFloatType, constify, PsIntegerType -if TYPE_CHECKING: - from ...codegen import GpuThreadsRange - class SyclPlatform(GenericGpu): @@ -39,7 +35,7 @@ class SyclPlatform(GenericGpu): self, ctx: KernelCreationContext, omit_range_check: bool = False, - automatic_block_size: bool = False + automatic_block_size: bool = False, ): super().__init__(ctx) @@ -52,7 +48,7 @@ class SyclPlatform(GenericGpu): def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> PsBlock: if isinstance(ispace, FullIterationSpace): return self._prepend_dense_translation(body, ispace) elif isinstance(ispace, SparseIterationSpace): @@ -113,15 +109,13 @@ class SyclPlatform(GenericGpu): def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> PsBlock: rank = ispace.rank id_type = self._id_type(rank) id_symbol = PsExpression.make(self._ctx.get_symbol("id", id_type)) id_decl = self._id_declaration(rank, id_symbol) dimensions = ispace.dimensions_in_loop_order() - launch_config = self.threads_from_ispace(ispace) - indexing_decls = [id_decl] conds = [] @@ -153,11 +147,11 @@ class SyclPlatform(GenericGpu): body.statements = indexing_decls + body.statements ast = body - return ast, launch_config + return ast def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace - ) -> tuple[PsBlock, GpuThreadsRange]: + ) -> PsBlock: factory = AstFactory(self._ctx) id_type = PsCustomType("sycl::id< 1 >", const=True) @@ -195,7 +189,7 @@ class SyclPlatform(GenericGpu): body.statements = [sparse_idx_decl] + body.statements ast = body - return ast, self.threads_from_ispace(ispace) + return ast def _item_type(self, rank: int): if not self._automatic_block_size: diff --git a/src/pystencils/codegen/__init__.py b/src/pystencils/codegen/__init__.py index e13f911dd9c2f8dd1a7b264a79dcdcd51cbef003..1b2cd2ffb6256ecd2dbc287e5f4f4c7a3a723dc4 100644 --- a/src/pystencils/codegen/__init__.py +++ b/src/pystencils/codegen/__init__.py @@ -4,8 +4,10 @@ from .config import ( AUTO, ) from .parameters import Parameter -from .kernel import Kernel, GpuKernel, GpuThreadsRange +from .kernel import Kernel, GpuKernel from .driver import create_kernel, get_driver +from .functions import Lambda +from .errors import CodegenError __all__ = [ "Target", @@ -14,7 +16,8 @@ __all__ = [ "Parameter", "Kernel", "GpuKernel", - "GpuThreadsRange", + "Lambda", "create_kernel", "get_driver", + "CodegenError", ] diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index bce0757313b597cb9f4413e84652b9878c39a2d5..0d43b40e35b9e90b1f13e2ba7bc279274f466859 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -3,6 +3,7 @@ from __future__ import annotations from warnings import warn from abc import ABC from collections.abc import Collection +from enum import Enum, auto from typing import TYPE_CHECKING, Sequence, Generic, TypeVar, Callable, Any, cast from dataclasses import dataclass, InitVar, fields @@ -85,7 +86,9 @@ class Option(Generic[Option_T, Arg_T]): self._name = name self._lookup = f"_{name}" - def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Option_T | None: + def __get__( + self, obj: ConfigBase, objtype: type[ConfigBase] | None = None + ) -> Option_T | None: if obj is None: return None @@ -193,7 +196,9 @@ class Category(Generic[Category_T]): self._name = name self._lookup = f"_{name}" - def __get__(self, obj: ConfigBase, objtype: type[ConfigBase] | None = None) -> Category_T: + def __get__( + self, obj: ConfigBase, objtype: type[ConfigBase] | None = None + ) -> Category_T: if obj is None: return None @@ -331,10 +336,42 @@ class CpuOptions(ConfigBase): """ +class GpuIndexingScheme(Enum): + """Available index translation schemes for GPU kernels.""" + + Linear3D = auto() + """Map coordinates to global thread indices. + + Supports up to three-dimensional iteration spaces. + For each dimension (with known start, stop and step values), compute the current iteration + point as ``start + step * (blockIdx.c * blockDim.c * threadDim.c)`` + (where c :math:`\\in` (x, y, z)). + """ + + Blockwise4D = auto() + """On a 3D grid of 1D blocks, map the fastest coordinate onto the intra-block thread index, + and slower coordinates onto the block index. + + Supports up to four-dimensional iteration spaces. + Using this indexing scheme, the iteration counters of up to four dimensions are assigned + like follows, from slowest to fastest: + + .. code-block:: C++ + + ctr_3 = blockIdx.z; + ctr_2 = blockIdx.y; + ctr_1 = blockIdx.x; + ctr_0 = threadIDx.x; + """ + + @dataclass class GpuOptions(ConfigBase): """Configuration options specific to GPU targets.""" + indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D) + """Thread indexing scheme for dense GPU kernels.""" + omit_range_check: BasicOption[bool] = BasicOption(False) """If set to `True`, omit the iteration counter range check. @@ -343,8 +380,13 @@ class GpuOptions(ConfigBase): This check can be discarded through this option, at your own peril. """ - block_size: BasicOption[tuple[int, int, int]] = BasicOption() - """Desired block size for the execution of GPU kernels. May be overridden later by the runtime system.""" + block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) + """Desired block size for the execution of GPU kernels. + + This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>` + is chosen as an indexing scheme. + The block size may be overridden at runtime. + """ manual_launch_grid: BasicOption[bool] = BasicOption(False) """Always require a manually specified launch grid when running this kernel. @@ -354,6 +396,31 @@ class GpuOptions(ConfigBase): The launch grid will then have to be specified manually at runtime. """ + @indexing_scheme.validate + def _validate_idx_scheme(self, val: str | GpuIndexingScheme): + if isinstance(val, GpuIndexingScheme): + return val + + match val.lower(): + case "block": + warn( + "GPU indexing scheme name `block` is deprecated and will be removed in pystencils 2.1. " + "Use `Linear3D` instead." + ) + return GpuIndexingScheme.Linear3D + case "line": + warn( + "GPU indexing scheme name `line` is deprecated and will be removed in pystencils 2.1. " + "Use `Blockwise4D` instead." + ) + return GpuIndexingScheme.Blockwise4D + case "linear3d": + return GpuIndexingScheme.Linear3D + case "blockwise4d": + return GpuIndexingScheme.Blockwise4D + case _: + raise ValueError(f"Invalid GPU indexing scheme: {val}") + @dataclass class SyclOptions(ConfigBase): @@ -506,6 +573,9 @@ class CreateKernelConfig(ConfigBase): cpu_vectorize_info: InitVar[dict | None] = None """Deprecated; use `cpu.vectorize <CpuOptions.vectorize>` instead.""" + gpu_indexing: InitVar[str | None] = None + """Deprecated; use `gpu.indexing_scheme <GpuOptions.indexing_scheme>` instead.""" + gpu_indexing_params: InitVar[dict | None] = None """Deprecated; set options in the `gpu` category instead.""" @@ -531,11 +601,8 @@ class CreateKernelConfig(ConfigBase): elif self.get_target() == Target.CUDA: try: from ..jit.gpu_cupy import CupyJit - - if self.gpu is not None and self.gpu.block_size is not None: - return CupyJit(self.gpu.block_size) - else: - return CupyJit() + + return CupyJit() except ImportError: from ..jit import no_jit @@ -564,6 +631,7 @@ class CreateKernelConfig(ConfigBase): data_type: UserTypeSpec | None, cpu_openmp: bool | int | None, cpu_vectorize_info: dict | None, + gpu_indexing: str | None, gpu_indexing_params: dict | None, ): # pragma: no cover if data_type is not None: @@ -593,9 +661,7 @@ class CreateKernelConfig(ConfigBase): deprecated_omp.enable = True deprecated_omp.num_threads = cpu_openmp case _: - raise ValueError( - f"Invalid option for `cpu_openmp`: {cpu_openmp}" - ) + raise ValueError(f"Invalid option for `cpu_openmp`: {cpu_openmp}") self.cpu.openmp = deprecated_omp @@ -652,11 +718,20 @@ class CreateKernelConfig(ConfigBase): self.cpu.vectorize = deprecated_vec_opts + if gpu_indexing is not None: + _deprecated_option("gpu_indexing", "gpu.indexing_scheme") + warn( + "Setting the deprecated `gpu_indexing` will override the `gpu.indexing_scheme` option", + UserWarning, + ) + self.gpu.indexing_scheme = gpu_indexing + if gpu_indexing_params is not None: - _deprecated_option("gpu_indexing_params", "gpu_indexing") + _deprecated_option("gpu_indexing_params", "gpu") warn( "Setting the deprecated `gpu_indexing_params` will override any options " - "passed in the `gpu` category." + "passed in the `gpu` category.", + UserWarning, ) self.gpu = GpuOptions( diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 6f44e718d9aaebd393dacb9a99ae88702f07ccaa..14a95c84d899638ea796d13cfddf7dd4e7ccd04f 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import cast, Sequence, Iterable, TYPE_CHECKING +from typing import cast, Sequence, Iterable, Callable, TYPE_CHECKING from dataclasses import dataclass, replace from .target import Target @@ -10,10 +10,12 @@ from .config import ( _AUTO_TYPE, GhostLayerSpec, IterationSliceSpec, + GpuIndexingScheme, ) -from .kernel import Kernel, GpuKernel, GpuThreadsRange -from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr +from .kernel import Kernel, GpuKernel +from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter +from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration from ..field import Field from ..types import PsIntegerType, PsScalarType @@ -38,7 +40,6 @@ from ..backend.platforms import ( Platform, GenericCpu, GenericVectorCpu, - GenericGpu, ) from ..backend.exceptions import VectorizationError @@ -145,6 +146,7 @@ class DefaultKernelCreationDriver: ) self._target = cfg.get_target() + self._gpu_indexing: GpuIndexing | None = self._get_gpu_indexing() self._platform = self._get_platform() self._intermediates: CodegenIntermediates | None @@ -163,15 +165,9 @@ class DefaultKernelCreationDriver: ) -> Kernel: kernel_body = self.parse_kernel_body(assignments) - match self._platform: - case GenericCpu(): - kernel_ast = self._platform.materialize_iteration_space( - kernel_body, self._ctx.get_iteration_space() - ) - case GenericGpu(): - kernel_ast, gpu_threads = self._platform.materialize_iteration_space( - kernel_body, self._ctx.get_iteration_space() - ) + kernel_ast = self._platform.materialize_iteration_space( + kernel_body, self._ctx.get_iteration_space() + ) if self._intermediates is not None: self._intermediates.materialized_ispace = kernel_ast.clone() @@ -215,14 +211,16 @@ class DefaultKernelCreationDriver: self._cfg.get_jit(), ) else: + assert self._gpu_indexing is not None + return create_gpu_kernel_function( self._ctx, self._platform, kernel_ast, - gpu_threads, self._cfg.get_option("function_name"), self._target, self._cfg.get_jit(), + self._gpu_indexing.get_launch_config_factory(), ) def parse_kernel_body( @@ -395,6 +393,18 @@ class DefaultKernelCreationDriver: return kernel_ast + def _get_gpu_indexing(self) -> GpuIndexing | None: + if self._target != Target.CUDA: + return None + + from .gpu_indexing import dim3 + + idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") + block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") + manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") + + return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid) + def _get_platform(self) -> Platform: if Target._CPU in self._target: if Target._X86 in self._target: @@ -430,7 +440,9 @@ class DefaultKernelCreationDriver: case Target.SYCL: from ..backend.platforms import SyclPlatform - auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") + auto_block_size: bool = self._cfg.sycl.get_option( + "automatic_block_size" + ) return SyclPlatform( self._ctx, @@ -440,12 +452,16 @@ class DefaultKernelCreationDriver: case Target.CUDA: from ..backend.platforms import CudaPlatform - manual_grid = gpu_opts.get_option("manual_launch_grid") + thread_mapping = ( + self._gpu_indexing.get_thread_mapping() + if self._gpu_indexing is not None + else None + ) return CudaPlatform( self._ctx, omit_range_check=omit_range_check, - manual_launch_grid=manual_grid, + thread_mapping=thread_mapping, ) raise NotImplementedError( @@ -475,51 +491,50 @@ def create_gpu_kernel_function( ctx: KernelCreationContext, platform: Platform, body: PsBlock, - threads_range: GpuThreadsRange | None, function_name: str, target_spec: Target, jit: JitBase, + launch_config_factory: Callable[[], GpuLaunchConfiguration], ) -> GpuKernel: undef_symbols = collect_undefined_symbols(body) - if threads_range is not None: - for threads in threads_range.num_work_items: - undef_symbols |= collect_undefined_symbols(threads) - params = _get_function_params(ctx, undef_symbols) req_headers = _get_headers(ctx, platform, body) kfunc = GpuKernel( body, - threads_range, target_spec, function_name, params, req_headers, jit, + launch_config_factory, ) kfunc.metadata.update(ctx.metadata) return kfunc -def _get_function_params( - ctx: KernelCreationContext, symbols: Iterable[PsSymbol] -) -> list[Parameter]: - params: list[Parameter] = [] +def _symbol_to_param(ctx: KernelCreationContext, symbol: PsSymbol): + from pystencils.backend.memory import BufferBasePtr, BackendPrivateProperty - from pystencils.backend.memory import BufferBasePtr + props: set[PsSymbolProperty] = set() + for prop in symbol.properties: + match prop: + case BufferBasePtr(buf): + field = ctx.find_field(buf.name) + props.add(FieldBasePtr(field)) + case BackendPrivateProperty(): + pass + case _: + props.add(prop) - for symb in symbols: - props: set[PsSymbolProperty] = set() - for prop in symb.properties: - match prop: - case FieldShape() | FieldStride(): - props.add(prop) - case BufferBasePtr(buf): - field = ctx.find_field(buf.name) - props.add(FieldBasePtr(field)) - params.append(Parameter(symb.name, symb.get_dtype(), props)) + return Parameter(symbol.name, symbol.get_dtype(), props) + +def _get_function_params( + ctx: KernelCreationContext, symbols: Iterable[PsSymbol] +) -> list[Parameter]: + params: list[Parameter] = [_symbol_to_param(ctx, s) for s in symbols] params.sort(key=lambda p: p.name) return params diff --git a/src/pystencils/codegen/errors.py b/src/pystencils/codegen/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..eceb53f611d92a2f3e8c5c4c9105d5fc8e4aa507 --- /dev/null +++ b/src/pystencils/codegen/errors.py @@ -0,0 +1,2 @@ +class CodegenError(Exception): + """Exception that indicates a fatal error in the code generation driver.""" diff --git a/src/pystencils/codegen/functions.py b/src/pystencils/codegen/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..f6be3b1f3446c6b9a25a0013f0e06d099edf5bed --- /dev/null +++ b/src/pystencils/codegen/functions.py @@ -0,0 +1,54 @@ +from __future__ import annotations +from typing import Sequence, Any + +from .parameters import Parameter +from ..types import PsType + +from ..backend.kernelcreation import KernelCreationContext +from ..backend.ast.expressions import PsExpression + + +class Lambda: + """A one-line function emitted by the code generator as an auxiliary object.""" + + @staticmethod + def from_expression(ctx: KernelCreationContext, expr: PsExpression): + from ..backend.ast.analysis import collect_undefined_symbols + from .driver import _get_function_params + + params = _get_function_params(ctx, collect_undefined_symbols(expr)) + return Lambda(expr, params) + + def __init__(self, expr: PsExpression, params: Sequence[Parameter]): + self._expr = expr + self._params = tuple(params) + self._return_type = expr.get_dtype() + + @property + def parameters(self) -> tuple[Parameter, ...]: + """Parameters to this lambda""" + return self._params + + @property + def return_type(self) -> PsType: + """Return type of this lambda""" + return self._return_type + + def __call__(self, **kwargs) -> Any: + """Evaluate this lambda with the given arguments. + + The lambda must receive a value for each parameter listed in `parameters`. + """ + from ..backend.ast.expressions import evaluate_expression + + return evaluate_expression(self._expr, kwargs) + + def __str__(self) -> str: + return str(self._expr) + + def c_code(self) -> str: + """Print the C code of this lambda""" + from ..backend.emission import CAstPrinter + + printer = CAstPrinter() + return printer(self._expr) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..2d22ec624856d9cf8a0b825845fee04caaa4ee74 --- /dev/null +++ b/src/pystencils/codegen/gpu_indexing.py @@ -0,0 +1,356 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import cast, Any, Callable +from itertools import chain + +from .functions import Lambda +from .parameters import Parameter +from .errors import CodegenError +from .config import GpuIndexingScheme, _AUTO_TYPE + +from ..backend.kernelcreation import ( + KernelCreationContext, + FullIterationSpace, + SparseIterationSpace, +) +from ..backend.platforms.cuda import ThreadMapping + +from ..backend.ast.expressions import PsExpression + + +dim3 = tuple[int, int, int] +_Dim3Lambda = tuple[Lambda, Lambda, Lambda] + + +class GpuLaunchConfiguration(ABC): + """Base class for launch configurations for CUDA and HIP kernels. + + Args: + block_size: A triple of lambdas determining the GPU block size + grid_size: A triple of lambdas determining the GPU grid size + config_parameters: Set containing all parameters to the given lambdas that are not also + parameters to the associated kernel + """ + + @property + @abstractmethod + def parameters(self) -> frozenset[Parameter]: + """Parameters of this launch configuration""" + + @abstractmethod + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: + """Compute block and grid size for a kernel launch. + + Args: + kwargs: Valuation providing a value for each parameter listed in `parameters` + """ + + @abstractmethod + def jit_cache_key(self) -> Any: + """Return a hashable object that represents any user-configurable options of + this launch configuration, such that when the configuration changes, the JIT parameter + cache is invalidated.""" + + +class AutomaticLaunchConfiguration(GpuLaunchConfiguration): + """Launch configuration that is dynamically computed from kernel parameters. + + This launch configuration permits no further user customization. + """ + + def __init__( + self, + block_size: _Dim3Lambda, + grid_size: _Dim3Lambda, + ) -> None: + self._block_size = block_size + self._grid_size = grid_size + + self._params: frozenset[Parameter] = frozenset().union( + *(lb.parameters for lb in chain(block_size, grid_size)) + ) + + @property + def parameters(self) -> frozenset[Parameter]: + return self._params + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: + block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) + grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) + return cast(dim3, block_size), cast(dim3, grid_size) + + def jit_cache_key(self) -> Any: + return () + + +class ManualLaunchConfiguration(GpuLaunchConfiguration): + """Manual GPU launch configuration. + + This launch configuration requires the user to set block and grid size. + """ + + def __init__( + self, + ) -> None: + self._block_size: dim3 | None = None + self._grid_size: dim3 | None = None + + @property + def block_size(self) -> dim3 | None: + return self._block_size + + @block_size.setter + def block_size(self, val: dim3): + self._block_size = val + + @property + def grid_size(self) -> dim3 | None: + return self._grid_size + + @grid_size.setter + def grid_size(self, val: dim3): + self._grid_size = val + + @property + def parameters(self) -> frozenset[Parameter]: + return frozenset() + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: + if self._block_size is None: + raise AttributeError("No GPU block size was set by the user.") + + if self._grid_size is None: + raise AttributeError("No GPU grid size was set by the user.") + + return self._block_size, self._grid_size + + def jit_cache_key(self) -> Any: + return (self._block_size, self._grid_size) + + +class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): + """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. + + The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items + in the kernel's iteration space as follows. + For each dimension :math:`c \\in \\{ x, y, z \\}`, + + - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``; + - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. + """ + + def __init__( + self, + num_work_items: _Dim3Lambda, + default_block_size: dim3 | None = None, + ) -> None: + self._num_work_items = num_work_items + + self._block_size: dim3 | None = default_block_size + + self._params: frozenset[Parameter] = frozenset().union( + *(wit.parameters for wit in num_work_items) + ) + + @property + def num_work_items(self) -> _Dim3Lambda: + """Lambda expressions that compute the number of work items in each iteration space + dimension from kernel parameters.""" + return self._num_work_items + + @property + def block_size(self) -> dim3 | None: + """The desired GPU block size.""" + return self._block_size + + @block_size.setter + def block_size(self, val: dim3): + self._block_size = val + + @property + def parameters(self) -> frozenset[Parameter]: + """Parameters of this launch configuration""" + return self._params + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: + if self._block_size is None: + raise AttributeError("No GPU block size was specified by the user!") + + from ..utils import div_ceil + + num_work_items = cast( + dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) + ) + reduced_block_size = cast( + dim3, + tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)), + ) + grid_size = cast( + dim3, + tuple( + div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size) + ), + ) + + return reduced_block_size, grid_size + + def jit_cache_key(self) -> Any: + return self._block_size + + +class GpuIndexing: + """Factory for GPU indexing objects required during code generation. + + This class acts as a helper class for the code generation driver. + It produces both the `ThreadMapping` required by the backend, + as well as factories for the launch configuration required later by the runtime system. + + Args: + ctx: The kernel creation context + scheme: The desired GPU indexing scheme + block_size: A user-defined default block size, required only if the indexing scheme permits + modification of the block size + manual_launch_grid: If `True`, always emit a `ManualLaunchConfiguration` to force the runtime system + to set the launch configuration explicitly + """ + + def __init__( + self, + ctx: KernelCreationContext, + scheme: GpuIndexingScheme, + default_block_size: dim3 | _AUTO_TYPE | None = None, + manual_launch_grid: bool = False, + ) -> None: + self._ctx = ctx + self._scheme = scheme + self._default_block_size = default_block_size + self._manual_launch_grid = manual_launch_grid + + from ..backend.kernelcreation import AstFactory + + self._factory = AstFactory(self._ctx) + + def get_thread_mapping(self) -> ThreadMapping: + """Retrieve a thread mapping object for use by the backend""" + + from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping + + match self._scheme: + case GpuIndexingScheme.Linear3D: + return Linear3DMapping() + case GpuIndexingScheme.Blockwise4D: + return Blockwise4DMapping() + + def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]: + """Retrieve a factory for the launch configuration for later consumption by the runtime system""" + if self._manual_launch_grid: + return ManualLaunchConfiguration + + match self._scheme: + case GpuIndexingScheme.Linear3D: + return self._get_linear3d_config_factory() + case GpuIndexingScheme.Blockwise4D: + return self._get_blockwise4d_config_factory() + + def _get_linear3d_config_factory( + self, + ) -> Callable[[], DynamicBlockSizeLaunchConfiguration]: + work_items_expr = self._get_work_items() + rank = len(work_items_expr) + + if rank > 3: + raise CodegenError( + "Cannot create a launch grid configuration using the Linear3D indexing scheme" + f" for a {rank}-dimensional kernel." + ) + + num_work_items = cast( + _Dim3Lambda, + tuple(Lambda.from_expression(self._ctx, wit) for wit in work_items_expr), + ) + + def factory(): + return DynamicBlockSizeLaunchConfiguration( + num_work_items, + self._get_default_block_size(rank), + ) + + return factory + + def _get_default_block_size(self, rank: int) -> dim3: + if self._default_block_size is None: + raise CodegenError("The default block size option was not set") + + if isinstance(self._default_block_size, _AUTO_TYPE): + match rank: + case 1: + return (256, 1, 1) + case 2: + return (128, 2, 1) + case 3: + return (128, 2, 2) + case _: + assert False, "unreachable code" + else: + return self._default_block_size + + def _get_blockwise4d_config_factory( + self, + ) -> Callable[[], AutomaticLaunchConfiguration]: + work_items = self._get_work_items() + rank = len(work_items) + + if rank > 4: + raise ValueError(f"Iteration space rank is too large: {rank}") + + block_size = ( + Lambda.from_expression(self._ctx, work_items[0]), + Lambda.from_expression(self._ctx, self._factory.parse_index(1)), + Lambda.from_expression(self._ctx, self._factory.parse_index(1)), + ) + + grid_size = tuple( + Lambda.from_expression(self._ctx, wit) for wit in work_items[1:] + ) + tuple( + Lambda.from_expression(self._ctx, self._factory.parse_index(1)) + for _ in range(4 - rank) + ) + + def factory(): + return AutomaticLaunchConfiguration( + block_size, + cast(_Dim3Lambda, grid_size), + ) + + return factory + + def _get_work_items(self) -> tuple[PsExpression, ...]: + """Return a tuple of expressions representing the number of work items + in each dimension of the kernel's iteration space, + ordered from fastest to slowest dimension. + """ + ispace = self._ctx.get_iteration_space() + match ispace: + case FullIterationSpace(): + dimensions = ispace.dimensions_in_loop_order()[::-1] + + from ..backend.ast.analysis import collect_undefined_symbols as collect + + for i, dim in enumerate(dimensions): + symbs = collect(dim.start) | collect(dim.stop) | collect(dim.step) + for ctr in ispace.counters: + if ctr in symbs: + raise CodegenError( + "Unable to construct GPU launch grid constraints for this kernel: " + f"Limits in dimension {i} " + f"depend on another dimension's counter {ctr.name}" + ) + + return tuple(ispace.actual_iterations(dim) for dim in dimensions) + + case SparseIterationSpace(): + return (self._factory.parse_index(ispace.index_list.shape[0]),) + + case _: + assert False, "unexpected iteration space" diff --git a/src/pystencils/codegen/kernel.py b/src/pystencils/codegen/kernel.py index 3adc47876dc36af02ee307dde25ad5d7250cd3fb..181e6ad3b5d1cb1f1835d8b4e656c39f65a1316b 100644 --- a/src/pystencils/codegen/kernel.py +++ b/src/pystencils/codegen/kernel.py @@ -6,8 +6,9 @@ from itertools import chain from .target import Target from .parameters import Parameter +from .gpu_indexing import GpuLaunchConfiguration + from ..backend.ast.structural import PsBlock -from ..backend.ast.expressions import PsExpression from ..field import Field from .._deprecation import _deprecated @@ -118,54 +119,16 @@ class GpuKernel(Kernel): def __init__( self, body: PsBlock, - threads_range: GpuThreadsRange | None, target: Target, name: str, parameters: Sequence[Parameter], required_headers: set[str], jit: JitBase, + launch_config_factory: Callable[[], GpuLaunchConfiguration], ): super().__init__(body, target, name, parameters, required_headers, jit) - self._threads_range = threads_range + self._launch_config_factory = launch_config_factory - @property - def threads_range(self) -> GpuThreadsRange | None: + def get_launch_configuration(self) -> GpuLaunchConfiguration: """Object exposing the total size of the launch grid this kernel expects to be executed with.""" - return self._threads_range - - -class GpuThreadsRange: - """Number of threads required by a GPU kernel, in order (x, y, z).""" - - def __init__( - self, - num_work_items: Sequence[PsExpression], - ): - self._dim = len(num_work_items) - self._num_work_items = tuple(num_work_items) - - # @property - # def grid_size(self) -> tuple[PsExpression, ...]: - # return self._grid_size - - # @property - # def block_size(self) -> tuple[PsExpression, ...]: - # return self._block_size - - @property - def num_work_items(self) -> tuple[PsExpression, ...]: - """Number of work items in (x, y, z)-order.""" - return self._num_work_items - - @property - def dim(self) -> int: - return self._dim - - def __str__(self) -> str: - rep = "GpuThreadsRange { " - rep += "; ".join(f"{x}: {w}" for x, w in zip("xyz", self._num_work_items)) - rep += " }" - return rep - - def _repr_html_(self) -> str: - return str(self) + return self._launch_config_factory() diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index a407bb75e08bfde9911070aef03b4a1769a6221a..1c771a427ab5025c0f041d88f1acbec7da9f0920 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -1,4 +1,4 @@ -from typing import Any, Sequence, cast +from typing import Any, Sequence, cast, Callable from dataclasses import dataclass try: @@ -18,6 +18,7 @@ from ..codegen import ( GpuKernel, Parameter, ) +from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr from ..types import PsStructType, PsPointerType @@ -35,12 +36,10 @@ class CupyKernelWrapper(KernelWrapper): self, kfunc: GpuKernel, raw_kernel: Any, - block_size: tuple[int, int, int], ): self._kfunc: GpuKernel = kfunc + self._launch_config = kfunc.get_launch_configuration() self._raw_kernel = raw_kernel - self._block_size = block_size - self._num_blocks: tuple[int, int, int] | None = None self._args_cache: dict[Any, tuple] = dict() @property @@ -48,24 +47,12 @@ class CupyKernelWrapper(KernelWrapper): return self._kfunc @property - def raw_kernel(self): - return self._raw_kernel - - @property - def block_size(self) -> tuple[int, int, int]: - return self._block_size - - @block_size.setter - def block_size(self, bs: tuple[int, int, int]): - self._block_size = bs + def launch_config(self) -> GpuLaunchConfiguration: + return self._launch_config @property - def num_blocks(self) -> tuple[int, int, int] | None: - return self._num_blocks - - @num_blocks.setter - def num_blocks(self, nb: tuple[int, int, int] | None): - self._num_blocks = nb + def raw_kernel(self): + return self._raw_kernel def __call__(self, **kwargs: Any): kernel_args, launch_grid = self._get_cached_args(**kwargs) @@ -80,7 +67,9 @@ class CupyKernelWrapper(KernelWrapper): return devices.pop() def _get_cached_args(self, **kwargs): - key = (self._block_size, self._num_blocks) + tuple((k, id(v)) for k, v in kwargs.items()) + key = (self._launch_config.jit_cache_key(),) + tuple( + (k, id(v)) for k, v in kwargs.items() + ) if key not in self._args_cache: args = self._get_args(**kwargs) @@ -90,16 +79,19 @@ class CupyKernelWrapper(KernelWrapper): return self._args_cache[key] def _get_args(self, **kwargs) -> tuple[tuple, LaunchGrid]: - args = [] + kernel_args = [] valuation: dict[str, Any] = dict() - def add_arg(name: str, arg: Any, dtype: PsType): - nptype = dtype.numpy_dtype + def add_arg(param: Parameter, arg: Any): + nptype = param.dtype.numpy_dtype assert nptype is not None typecast = nptype.type arg = typecast(arg) - args.append(arg) - valuation[name] = arg + valuation[param.name] = arg + + def add_kernel_arg(param: Parameter, arg: Any): + add_arg(param, arg) + kernel_args.append(arg) field_shapes = set() index_shapes = set() @@ -152,21 +144,23 @@ class CupyKernelWrapper(KernelWrapper): ) # Collect parameter values - arr: cp.ndarray - for kparam in self._kfunc.parameters: - if kparam.is_field_parameter: + def process_param(param: Parameter, adder: Callable[[Parameter, Any], None]): + arr: cp.ndarray + + if param.is_field_parameter: # Determine field-associated data to pass in - for prop in kparam.properties: + for prop in param.properties: match prop: case FieldBasePtr(field): elem_dtype: PsType from .. import DynamicType + if isinstance(field.dtype, DynamicType): - assert isinstance(kparam.dtype, PsPointerType) - elem_dtype = kparam.dtype.base_type + assert isinstance(param.dtype, PsPointerType) + elem_dtype = param.dtype.base_type else: elem_dtype = field.dtype @@ -176,65 +170,39 @@ class CupyKernelWrapper(KernelWrapper): f"Data type mismatch at array argument {field.name}:" f"Expected {field.dtype}, got {arr.dtype}" ) - check_shape(kparam, arr) - args.append(arr) + check_shape(param, arr) + kernel_args.append(arr) break case FieldShape(field, coord): arr = kwargs[field.name] - add_arg(kparam.name, arr.shape[coord], kparam.dtype) + adder(param, arr.shape[coord]) break case FieldStride(field, coord): arr = kwargs[field.name] - add_arg( - kparam.name, + adder( + param, arr.strides[coord] // arr.dtype.itemsize, - kparam.dtype, ) break else: # scalar parameter - val: Any = kwargs[kparam.name] - add_arg(kparam.name, val, kparam.dtype) - - # Determine launch grid - from ..backend.ast.expressions import evaluate_expression - - symbolic_threads_range = self._kfunc.threads_range - - if self._num_blocks is not None: - launch_grid = LaunchGrid(self._num_blocks, self._block_size) - - elif symbolic_threads_range is not None: - threads_range: list[int] = [ - evaluate_expression(expr, valuation) - for expr in symbolic_threads_range.num_work_items - ] - - if symbolic_threads_range.dim < 3: - threads_range += [1] * (3 - symbolic_threads_range.dim) + val: Any = kwargs[param.name] + adder(param, val) - def div_ceil(a, b): - return a // b if a % b == 0 else a // b + 1 + # Process Arguments - # TODO: Refine this? - num_blocks = tuple( - div_ceil(threads, tpb) - for threads, tpb in zip(threads_range, self._block_size) - ) - assert len(num_blocks) == 3 + for kparam in self._kfunc.parameters: + process_param(kparam, add_kernel_arg) - launch_grid = LaunchGrid(num_blocks, self._block_size) + for cparam in self._launch_config.parameters: + if cparam.name not in valuation: + process_param(cparam, add_arg) - else: - raise JitError( - "Unable to determine launch grid for GPU kernel invocation: " - "No manual grid size was specified, and the number of threads could not " - "be determined automatically." - ) + block_size, grid_size = self._launch_config.evaluate(**valuation) - return tuple(args), launch_grid + return tuple(kernel_args), LaunchGrid(grid_size, block_size) class CupyJit(JitBase): @@ -252,26 +220,26 @@ class CupyJit(JitBase): tuple(default_block_size) + (1,) * (3 - len(default_block_size)), ) - def compile(self, kfunc: Kernel) -> KernelWrapper: + def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable." ) - if not isinstance(kfunc, GpuKernel) or kfunc.target != Target.CUDA: + if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA: raise ValueError( "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP" ) options = self._compiler_options() - prelude = self._prelude(kfunc) - kernel_code = self._kernel_code(kfunc) + prelude = self._prelude(kernel) + kernel_code = self._kernel_code(kernel) code = prelude + kernel_code raw_kernel = cp.RawKernel( - code, kfunc.name, options=options, backend="nvrtc", jitify=True + code, kernel.name, options=options, backend="nvrtc", jitify=True ) - return CupyKernelWrapper(kfunc, raw_kernel, self._default_block_size) + return CupyKernelWrapper(kernel, raw_kernel) def _compiler_options(self) -> tuple[str, ...]: options = ["-w", "-std=c++11"] diff --git a/src/pystencils/sympyextensions/integer_functions.py b/src/pystencils/sympyextensions/integer_functions.py index 42513ef9c587bd8f14228083afa4fff419220fa4..9d2c69502f99130699262dd9b63362a3841fccfc 100644 --- a/src/pystencils/sympyextensions/integer_functions.py +++ b/src/pystencils/sympyextensions/integer_functions.py @@ -140,10 +140,10 @@ class div_ceil(IntegerFunctionTwoArgsMixIn): @classmethod def eval(cls, arg1, arg2): - from ..utils import c_intdiv + from ..utils import div_ceil if is_integer_sequence((arg1, arg2)): - return c_intdiv(arg1 + arg2 - 1, arg2) + return div_ceil(arg1, arg2) def _eval_op(self, arg1, arg2): return self.eval(arg1, arg2) diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py index a53eb82896ab635c3995be918d31a03326766d5d..0049d0a2ce36471c3cfb98bf39c965d33c24868b 100644 --- a/src/pystencils/utils.py +++ b/src/pystencils/utils.py @@ -4,11 +4,13 @@ from itertools import groupby from collections import Counter from contextlib import contextmanager from tempfile import NamedTemporaryFile -from typing import Mapping +from typing import Mapping, overload import numpy as np import sympy as sp +from numpy.typing import NDArray + class DotDict(dict): """Normal dict with additional dot access for all keys""" @@ -254,6 +256,24 @@ class ContextVar: return self.stack[-1] +@overload +def c_intdiv(num: int, denom: int) -> int: ... + + +@overload +def c_intdiv( + num: NDArray[np.integer], denom: NDArray[np.integer] +) -> NDArray[np.integer]: ... + + +@overload +def c_intdiv(num: int, denom: NDArray[np.integer]) -> NDArray[np.integer]: ... + + +@overload +def c_intdiv(num: NDArray[np.integer], denom: int) -> NDArray[np.integer]: ... + + def c_intdiv(num, denom): """C-style integer division""" if isinstance(num, np.ndarray) or isinstance(denom, np.ndarray): @@ -271,3 +291,28 @@ def c_rem(num, denom): """C-style integer remainder""" div = c_intdiv(num, denom) return num - div * denom + + +@overload +def div_ceil(divident: int, divisor: int) -> int: ... + + +@overload +def div_ceil( + divident: NDArray[np.integer], divisor: NDArray[np.integer] +) -> NDArray[np.integer]: ... + + +@overload +def div_ceil(divident: int, divisor: NDArray[np.integer]) -> NDArray[np.integer]: ... + + +@overload +def div_ceil(divident: NDArray[np.integer], divisor: int) -> NDArray[np.integer]: ... + + +def div_ceil(divident, divisor): + """For nonnegative integer arguments, compute ``ceil(num / denom)``. + The result is unspecified if either argument is negative.""" + + return c_intdiv(divident + divisor - 1, divisor) diff --git a/tests/kernelcreation/test_domain_kernels.py b/tests/kernelcreation/test_domain_kernels.py index da261faec49940df31d59f44651956e2012b113a..0d71dbe1a250c865c0f637aa3a125837abfe39e7 100644 --- a/tests/kernelcreation/test_domain_kernels.py +++ b/tests/kernelcreation/test_domain_kernels.py @@ -32,14 +32,7 @@ def inspect_dp_kernel(kernel: Kernel, gen_config: CreateKernelConfig): assert "_mm512_storeu_pd" in code -def test_filter_kernel(gen_config): - if gen_config.target == Target.CUDA: - import cupy as cp - - xp = cp - else: - xp = np - +def test_filter_kernel(gen_config, xp): weight = sp.Symbol("weight") stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]] @@ -62,14 +55,7 @@ def test_filter_kernel(gen_config): xp.testing.assert_allclose(dst_arr, expected) -def test_filter_kernel_fixedsize(gen_config): - if gen_config.target == Target.CUDA: - import cupy as cp - - xp = cp - else: - xp = np - +def test_filter_kernel_fixedsize(gen_config, xp): weight = sp.Symbol("weight") stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]] diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 97f0c0fa9ee847389aee71bc511abef55c9f2b93..10b37e610cebd23c9fc961f14118aee5f24582c4 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -4,16 +4,87 @@ import numpy as np import sympy as sp from scipy.ndimage import convolve -from pystencils import Assignment, Field, fields, CreateKernelConfig, create_kernel, Target -# from pystencils.gpu import BlockIndexing -from pystencils.simp import sympy_cse_on_assignment_list -from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers, normalize_slice +from pystencils import ( + Assignment, + Field, + fields, + CreateKernelConfig, + create_kernel, + Target, +) + +from pystencils.slicing import ( + add_ghost_layers, + make_slice, + remove_ghost_layers, + normalize_slice, +) try: import cupy as cp + device_numbers = range(cp.cuda.runtime.getDeviceCount()) except ImportError: - device_numbers = [] + pytest.skip(reason="CuPy is not available", allow_module_level=True) + + +@pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) +@pytest.mark.parametrize("omit_range_check", [False, True]) +@pytest.mark.parametrize("manual_grid", [False, True]) +def test_indexing_options( + indexing_scheme: str, omit_range_check: bool, manual_grid: bool +): + src, dst = fields("src, dst: [3D]") + asm = Assignment( + dst.center(), + src[-1, 0, 0] + + src[1, 0, 0] + + src[0, -1, 0] + + src[0, 1, 0] + + src[0, 0, -1] + + src[0, 0, 1], + ) + + cfg = CreateKernelConfig(target=Target.CUDA) + cfg.gpu.indexing_scheme = indexing_scheme + cfg.gpu.omit_range_check = omit_range_check + cfg.gpu.manual_launch_grid = manual_grid + + ast = create_kernel(asm, cfg) + kernel = ast.compile() + + src_arr = cp.ones((18, 34, 42)) + dst_arr = cp.zeros_like(src_arr) + + if manual_grid: + match indexing_scheme: + case "linear3d": + kernel.launch_config.block_size = (10, 8, 8) + kernel.launch_config.grid_size = (4, 4, 2) + case "blockwise4d": + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) + + elif indexing_scheme == "linear3d": + kernel.launch_config.block_size = (10, 8, 8) + + kernel(src=src_arr, dst=dst_arr) + + expected = cp.zeros_like(src_arr) + expected[1:-1, 1:-1, 1:-1].fill(6.0) + + cp.testing.assert_allclose(dst_arr, expected) + + +def test_invalid_indexing_schemes(): + src, dst = fields("src, dst: [4D]") + asm = Assignment(src.center(0), dst.center(0)) + + cfg = CreateKernelConfig(target=Target.CUDA) + cfg.gpu.indexing_scheme = "linear3d" + + with pytest.raises(Exception): + create_kernel(asm, cfg) def test_averaging_kernel(): @@ -21,14 +92,16 @@ def test_averaging_kernel(): src_arr = np.random.rand(*size) src_arr = add_ghost_layers(src_arr) dst_arr = np.zeros_like(src_arr) - src_field = Field.create_from_numpy_array('src', src_arr) - dst_field = Field.create_from_numpy_array('dst', dst_arr) + src_field = Field.create_from_numpy_array("src", src_arr) + dst_field = Field.create_from_numpy_array("dst", dst_arr) - update_rule = Assignment(dst_field[0, 0], - (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4) + update_rule = Assignment( + dst_field[0, 0], + (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4, + ) config = CreateKernelConfig(target=Target.GPU) - ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config) + ast = create_kernel(update_rule, config=config) kernel = ast.compile() gpu_src_arr = cp.asarray(src_arr) @@ -37,20 +110,24 @@ def test_averaging_kernel(): dst_arr = gpu_dst_arr.get() stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0 - reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0) + reference = convolve( + remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0 + ) reference = add_ghost_layers(reference) np.testing.assert_almost_equal(reference, dst_arr) def test_variable_sized_fields(): - src_field = Field.create_generic('src', spatial_dimensions=2) - dst_field = Field.create_generic('dst', spatial_dimensions=2) + src_field = Field.create_generic("src", spatial_dimensions=2) + dst_field = Field.create_generic("dst", spatial_dimensions=2) - update_rule = Assignment(dst_field[0, 0], - (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4) + update_rule = Assignment( + dst_field[0, 0], + (src_field[0, 1] + src_field[0, -1] + src_field[1, 0] + src_field[-1, 0]) / 4, + ) config = CreateKernelConfig(target=Target.GPU) - ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config) + ast = create_kernel(update_rule, config=config) kernel = ast.compile() size = (3, 3) @@ -64,7 +141,9 @@ def test_variable_sized_fields(): dst_arr = gpu_dst_arr.get() stencil = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]) / 4.0 - reference = convolve(remove_ghost_layers(src_arr), stencil, mode='constant', cval=0.0) + reference = convolve( + remove_ghost_layers(src_arr), stencil, mode="constant", cval=0.0 + ) reference = add_ghost_layers(reference) np.testing.assert_almost_equal(reference, dst_arr) @@ -76,12 +155,14 @@ def test_multiple_index_dimensions(): src_arr = np.array(np.random.rand(*src_size)) dst_arr = np.zeros(dst_size) - src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=1) - dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0) + src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=1) + dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0) offset = (-2, -1) - update_rule = Assignment(dst_field[0, 0], - sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])])) + update_rule = Assignment( + dst_field[0, 0], + sum([src_field[offset[0], offset[1]](i) for i in range(src_size[-1])]), + ) config = CreateKernelConfig(target=Target.GPU) ast = create_kernel([update_rule], config=config) @@ -94,26 +175,31 @@ def test_multiple_index_dimensions(): reference = np.zeros_like(dst_arr) gl = np.max(np.abs(np.array(offset, dtype=int))) - for x in range(gl, src_size[0]-gl): - for y in range(gl, src_size[1]-gl): - reference[x, y] = sum([src_arr[x+offset[0], y+offset[1], i] for i in range(src_size[2])]) + for x in range(gl, src_size[0] - gl): + for y in range(gl, src_size[1] - gl): + reference[x, y] = sum( + [src_arr[x + offset[0], y + offset[1], i] for i in range(src_size[2])] + ) np.testing.assert_almost_equal(reference, dst_arr) -@pytest.mark.xfail(reason="Line indexing not available yet") def test_ghost_layer(): size = (6, 5) src_arr = np.ones(size) dst_arr = np.zeros_like(src_arr) - src_field = Field.create_from_numpy_array('src', src_arr, index_dimensions=0) - dst_field = Field.create_from_numpy_array('dst', dst_arr, index_dimensions=0) + src_field = Field.create_from_numpy_array("src", src_arr, index_dimensions=0) + dst_field = Field.create_from_numpy_array("dst", dst_arr, index_dimensions=0) update_rule = Assignment(dst_field[0, 0], src_field[0, 0]) ghost_layers = [(1, 2), (2, 1)] - config = CreateKernelConfig(target=Target.GPU, ghost_layers=ghost_layers, gpu="line") - ast = create_kernel(sympy_cse_on_assignment_list([update_rule]), config=config) + config = CreateKernelConfig() + config.target = Target.CUDA + config.ghost_layers = ghost_layers + config.gpu.indexing_scheme = "blockwise4d" + + ast = create_kernel(update_rule, config=config) kernel = ast.compile() gpu_src_arr = cp.asarray(src_arr) @@ -122,11 +208,13 @@ def test_ghost_layer(): dst_arr = gpu_dst_arr.get() reference = np.zeros_like(src_arr) - reference[ghost_layers[0][0]:-ghost_layers[0][1], ghost_layers[1][0]:-ghost_layers[1][1]] = 1 + reference[ + ghost_layers[0][0] : -ghost_layers[0][1], + ghost_layers[1][0] : -ghost_layers[1][1], + ] = 1 np.testing.assert_equal(reference, dst_arr) -@pytest.mark.xfail(reason="Line indexing not available yet") def test_setting_value(): arr_cpu = np.arange(25, dtype=np.float64).reshape(5, 5) arr_gpu = cp.asarray(arr_cpu) @@ -135,8 +223,12 @@ def test_setting_value(): f = Field.create_generic("f", 2) update_rule = [Assignment(f(0), sp.Symbol("value"))] - config = CreateKernelConfig(target=Target.GPU, gpu="line", iteration_slice=iteration_slice) - ast = create_kernel(sympy_cse_on_assignment_list(update_rule), config=config) + config = CreateKernelConfig() + config.target = Target.CUDA + config.iteration_slice = iteration_slice + config.gpu.indexing_scheme = "blockwise4d" + + ast = create_kernel(update_rule, config=config) kernel = ast.compile() kernel(f=arr_gpu, value=np.float64(42.0)) @@ -167,21 +259,30 @@ def test_periodicity(): def test_block_indexing(device_number): f = fields("f: [3D]") s = normalize_slice(make_slice[:, :, :], f.spatial_shape) - bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2), - permute_block_size_dependent_on_layout=False) - assert bi.call_parameters((3, 2, 32))['block'] == (3, 2, 32) - assert bi.call_parameters((32, 2, 32))['block'] == (16, 2, 8) - - bi = BlockIndexing(s, f.layout, block_size=(32, 1, 1), - permute_block_size_dependent_on_layout=False) - assert bi.call_parameters((1, 16, 16))['block'] == (1, 16, 2) - - bi = BlockIndexing(s, f.layout, block_size=(16, 8, 2), - maximum_block_size="auto", device_number=device_number) + bi = BlockIndexing( + s, f.layout, block_size=(16, 8, 2), permute_block_size_dependent_on_layout=False + ) + assert bi.call_parameters((3, 2, 32))["block"] == (3, 2, 32) + assert bi.call_parameters((32, 2, 32))["block"] == (16, 2, 8) + + bi = BlockIndexing( + s, f.layout, block_size=(32, 1, 1), permute_block_size_dependent_on_layout=False + ) + assert bi.call_parameters((1, 16, 16))["block"] == (1, 16, 2) + + bi = BlockIndexing( + s, + f.layout, + block_size=(16, 8, 2), + maximum_block_size="auto", + device_number=device_number, + ) # This function should be used if number of needed registers is known. Can be determined with func.num_regs registers_per_thread = 1000 - blocks = bi.limit_block_size_by_register_restriction([1024, 1024, 1], registers_per_thread) + blocks = bi.limit_block_size_by_register_restriction( + [1024, 1024, 1], registers_per_thread + ) if cp.cuda.runtime.is_hip: max_registers_per_block = cp.cuda.runtime.deviceGetAttribute(71, device_number) @@ -193,9 +294,9 @@ def test_block_indexing(device_number): assert np.prod(blocks) * registers_per_thread < max_registers_per_block -@pytest.mark.parametrize('gpu_indexing', ("block", "line")) -@pytest.mark.parametrize('layout', ("C", "F")) -@pytest.mark.parametrize('shape', ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11))) +@pytest.mark.parametrize("gpu_indexing", ("block", "line")) +@pytest.mark.parametrize("layout", ("C", "F")) +@pytest.mark.parametrize("shape", ((5, 5, 5, 5), (3, 17, 387, 4), (23, 44, 21, 11))) @pytest.mark.xfail(reason="4D kernels not available yet") def test_four_dimensional_kernel(gpu_indexing, layout, shape): n_elements = np.prod(shape) @@ -207,7 +308,9 @@ def test_four_dimensional_kernel(gpu_indexing, layout, shape): f = Field.create_from_numpy_array("f", arr_cpu) update_rule = [Assignment(f.center, sp.Symbol("value"))] - config = CreateKernelConfig(target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice) + config = CreateKernelConfig( + target=Target.GPU, gpu=gpu_indexing, iteration_slice=iteration_slice + ) ast = create_kernel(update_rule, config=config) kernel = ast.compile() diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py index 02b6b99220d11748647f82ccd88679cec6832ae7..b1f2da576750b60e4fbb8d9d4d33e393bc00dcf3 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -19,6 +19,7 @@ from pystencils.sympyextensions.integer_functions import int_rem from pystencils.simp import sympy_cse_on_assignment_list from pystencils.slicing import normalize_slice from pystencils.jit.gpu_cupy import CupyKernelWrapper +from pystencils.codegen.gpu_indexing import ManualLaunchConfiguration def test_sliced_iteration(): @@ -137,8 +138,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): expected[r, r:] = 1.0 update = Assignment(f.center(), 1) - outer_counter = DEFAULTS.spatial_counters[0] - islice = make_slice[:, outer_counter:] + + # Have NumPy data layout -> X is slowest coordinate, Y is fastest + slow_counter = DEFAULTS.spatial_counters[0] + islice = make_slice[:, slow_counter:] gen_config = replace(gen_config, iteration_slice=islice) if gen_config.target == Target.CUDA: @@ -147,8 +150,10 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): kernel = create_kernel(update, gen_config).compile() if isinstance(kernel, CupyKernelWrapper): - kernel.block_size = shape + (1,) - kernel.num_blocks = (1, 1, 1) + assert isinstance(kernel.launch_config, ManualLaunchConfiguration) + + kernel.launch_config.block_size = shape + (1,) + kernel.launch_config.grid_size = (1, 1, 1) kernel(f=f_arr) @@ -182,8 +187,10 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp): pytest.xfail("Gather/Scatter not implemented yet") if isinstance(kernel, CupyKernelWrapper): - kernel.block_size = (8, 16, 1) - kernel.num_blocks = (1, 1, 1) + assert isinstance(kernel.launch_config, ManualLaunchConfiguration) + + kernel.launch_config.block_size = (8, 16, 1) + kernel.launch_config.grid_size = (1, 1, 1) kernel(f=f_arr) diff --git a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py b/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py deleted file mode 100644 index da2b3a5ad3a0e224bc47a5dd0fa4f16b0ccde520..0000000000000000000000000000000000000000 --- a/tests/nbackend/kernelcreation/platform/test_gpu_platforms.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest - -from pystencils.field import Field - -from pystencils.backend.kernelcreation import ( - KernelCreationContext, - FullIterationSpace -) - -from pystencils.backend.ast.structural import PsBlock, PsComment - -from pystencils.backend.platforms import CudaPlatform, SyclPlatform - - -@pytest.mark.parametrize("layout", ["fzyx", "zyxf", "c", "f"]) -@pytest.mark.parametrize("platform_class", [CudaPlatform, SyclPlatform]) -def test_thread_range(platform_class, layout): - ctx = KernelCreationContext() - - body = PsBlock([PsComment("Kernel body goes here")]) - platform = platform_class(ctx) - - dim = 3 - archetype_field = Field.create_generic("field", spatial_dimensions=dim, layout=layout) - ispace = FullIterationSpace.create_with_ghost_layers(ctx, 1, archetype_field) - - _, threads_range = platform.materialize_iteration_space(body, ispace) - - assert threads_range.dim == dim - - match layout: - case "fzyx" | "zyxf" | "f": - indexing_order = [0, 1, 2] - case "c": - indexing_order = [2, 1, 0] - - for i in range(dim): - # Slowest to fastest coordinate - coordinate = indexing_order[i] - dimension = ispace.dimensions[coordinate] - witems = threads_range.num_work_items[i] - desired = dimension.stop - dimension.start - assert witems.structurally_equal(desired)