diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1420bd2b53e918cfce8e6b5491e94888ad36a0f9..ffee8829de5e64a54fcc0937745b08b97429d61b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -24,7 +24,6 @@ typechecker: - nox --session typecheck .testsuite-base: - extends: .nox-base stage: "Tests" needs: [] coverage: '/TOTAL.*\s+(\d+%)$/' @@ -38,13 +37,18 @@ typechecker: coverage_format: cobertura path: coverage.xml -"testsuite-py3.10": +"testsuite-py3.10+cuda": extends: .testsuite-base + image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:ubuntu24.04-cuda12.6 script: - nox --session testsuite-3.10 + tags: + - docker + - cuda11 "testsuite-py3.13": extends: .testsuite-base + image: i10git.cs.fau.de:5005/pycodegen/pycodegen/nox:alpine script: - nox --session testsuite-3.13 diff --git a/docs/source/_util/sfg_monkeypatch.py b/docs/source/_util/sfg_monkeypatch.py index 0269d40f43492ea1540f51f49c8e78c5ebebf37d..91a96233b0af6831dbd0c8ac4db06d862517d994 100644 --- a/docs/source/_util/sfg_monkeypatch.py +++ b/docs/source/_util/sfg_monkeypatch.py @@ -1,6 +1,8 @@ import pystencilssfg from pystencilssfg.config import SfgConfig +from os.path import splitext + class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): """Mockup wrapper around SourceFileGenerator for use in documentation @@ -28,23 +30,23 @@ class DocsPatchedGenerator(pystencilssfg.SourceFileGenerator): def __exit__(self, exc_type, exc_value, traceback): if exc_type is None: self._finish_files() + emitter = self._get_emitter() - header_code = self._emitter.dumps(self._header_file) - impl_code = ( - None - if self._impl_file is None - else self._emitter.dumps(self._impl_file) - ) + header_code = emitter.dumps(self._header_file) + header_ext = splitext(self._header_file.name)[1] mdcode = ":::::{tab-set}\n" - mdcode += "::::{tab-item} Generated Header (.hpp)\n" + mdcode += f"::::{{tab-item}} Generated Header ({header_ext})\n" mdcode += ":::{code-block} C++\n\n" mdcode += header_code mdcode += "\n:::\n::::\n" - if impl_code: - mdcode += "::::{tab-item} Generated Implementation (.cpp)\n" + if self._impl_file is not None: + impl_code = emitter.dumps(self._impl_file) + impl_ext = splitext(self._impl_file.name)[1] + + mdcode += f"::::{{tab-item}} Generated Implementation ({impl_ext})\n" mdcode += ":::{code-block} C++\n\n" mdcode += impl_code mdcode += "\n:::\n::::\n" diff --git a/docs/source/api/composer.rst b/docs/source/api/composer.rst index 124d0fb97ac8f94bc2a6d4c38815edee8403c65b..8b470b033cfb2755dd57c5f8cce682f64756b0d2 100644 --- a/docs/source/api/composer.rst +++ b/docs/source/api/composer.rst @@ -16,6 +16,9 @@ Composer API (``pystencilssfg.composer``) .. autoclass:: SfgClassComposer :members: +.. autoclass:: SfgGpuComposer + :members: + Custom Generators ================= @@ -37,6 +40,7 @@ Helper Methods and Builders .. autoclass:: SfgFunctionSequencer :members: + :inherited-members: .. autoclass:: SfgNodeBuilder :members: @@ -47,6 +51,12 @@ Helper Methods and Builders .. autoclass:: SfgSwitchBuilder :members: +.. module:: pystencilssfg.composer.class_composer + +.. autoclass:: SfgMethodSequencer + :members: + :inherited-members: + Context and Cursor ================== diff --git a/docs/source/api/lang.rst b/docs/source/api/lang.rst index bd5e4fa7993f30d4c8f6e5c532eb3d234c64061d..cdf8f242cf58b3f0d9889ad94cef6c7a8b7a8155 100644 --- a/docs/source/api/lang.rst +++ b/docs/source/api/lang.rst @@ -41,3 +41,9 @@ Implementation .. automodule:: pystencilssfg.lang.cpp :members: + +GPU Runtime APIs +---------------- + +.. 
automodule:: pystencilssfg.lang.gpu + :members: diff --git a/docs/source/usage/config_and_cli.md b/docs/source/usage/config_and_cli.md index 785ff52ed1d5153b04be9e900363af156ff94e7c..b6060c0b07f5a7a4f0165f6c681db5fb97bea976 100644 --- a/docs/source/usage/config_and_cli.md +++ b/docs/source/usage/config_and_cli.md @@ -12,7 +12,7 @@ different configuration sources: the generator script to set some of its configuration options; see [Command-Line Options](#cmdline_options) - **Project Configuration:** When embedded into a larger project, using a build system such as CMake, generator scripts may be configured globally within that project by the use of a *configuration module*. - Settings specified inside that configuration module are always overridden by the former to configuration sources. + Settings specified inside that configuration module are always overridden by the two other configuration sources listed above. For details on configuration modules, refer to the guide on [Project and Build System Integration](#guide_project_integration). (inline_config)= @@ -60,14 +60,26 @@ set {any}`cfg.outer_namespace <SfgConfig.outer_namespace>`. ### Code Style and Formatting - - Modify the values in the {any}`cfg.code_style <CodeStyle>` category to affect - certain formatting aspects of the generated code. - - To change, enforce, or disable auto-formatting of generated code through `clang-format`, - take a look at the {any}`cfg.clang_format <ClangFormatOptions>` category. - - Clang-format will, by default, sort `#include` statements alphabetically and separate - local and system header includes. - To override this, you can set a custom sorting key for `#include` sorting via - {any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`. +Pystencils-sfg gives you some options to affect its output code style. +These are controlled by the options in the {any}`cfg.code_style <CodeStyle>` category. + +Furthermore, pystencils-sfg uses `clang-format` to beautify generated code. +The behaviour of the clang-format integration is managed by +the {any}`cfg.clang_format <ClangFormatOptions>` category, +where you can set options to skip or enforce formatting, +or change the formatter binary. +To set the code style used by `clang-format`, either create a `.clang-format` file +in any of the parent folders of your generator script, +or modify the {any}`cfg.clang_format.code_style <ClangFormatOptions.code_style>` option. + +:::{seealso} +[Clang-Format Style Options](https://clang.llvm.org/docs/ClangFormatStyleOptions.html) +::: + +Clang-format will, by default, sort `#include` statements alphabetically and separate +local and system header includes. +To override this, you can set a custom sorting key for `#include` sorting via +{any}`cfg.code_style.includes_sorting_key <CodeStyle.includes_sorting_key>`. (cmdline_options)= ## Command-Line Options diff --git a/docs/source/usage/how_to_composer.md b/docs/source/usage/how_to_composer.md index 966a9a661b8f7c5d4d863b07c2a9549a95032591..849de18a3cca12e8bfc697e7b84aeb8ae3950239 100644 --- a/docs/source/usage/how_to_composer.md +++ b/docs/source/usage/how_to_composer.md @@ -283,7 +283,7 @@ The composer gives us access to the default kernel namespace (`<current_namespace>::kernels`) via `sfg.kernels`.
To add a kernel, - either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.reate() <KernelsAdder.create>`, + either pass its assignments and the pystencils code generator configuration directly to {any}`kernels.create() <KernelsAdder.create>`, - or create the kernel separately through {any}`pystencils.create_kernel <pystencils.codegen.create_kernel>` and register it using {any}`kernels.add() <KernelsAdder.add>`. @@ -392,13 +392,176 @@ with SourceFileGenerator() as sfg: ) ``` -(exposed_inline_kernels)= -### Exposed and Inline Kernels +## GPU Kernels + +Pystencils also allows us to generate kernels for the CUDA and HIP GPU programming models. +This section describes how to generate GPU kernels through pystencils-sfg, +how to invoke them with various launch configurations, +and how GPU execution streams are reflected. + +### Generate and Invoke CUDA and HIP Kernels + +To generate a kernel targeting either of these, set the +{any}`target <pystencils.codegen.config.CreateKernelConfig.target>` +code generator option to either `Target.CUDA` or `Target.HIP`. +After registering a GPU kernel, +its invocation can be rendered using {any}`sfg.gpu_invoke <SfgGpuComposer.gpu_invoke>`. +Here is an example using CUDA: + +```{code-cell} ipython3 +from pystencilssfg import SfgConfig +sfg_config = SfgConfig() +sfg_config.extensions.impl = "cu" + +with SourceFileGenerator(sfg_config) as sfg: + # Configure the code generator to use CUDA + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + # Create fields, assemble assignments + f, g = ps.fields("f, g: double[128, 128]") + asm = ps.Assignment(f(0), g(0)) + + # Register kernel + khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + # Invoke it + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle) + ) +``` + +In this snippet, we used the [generator configuration](#how_to_generator_scripts_config) +to change the suffix of the generated implementation file to `.cu`. + +When inspecting the generated `.cu` file, you can see that the GPU launch configuration parameters +*grid size* and *block size* are computed automatically from the array sizes. +This behavior can be changed by modifying options in the {any}`gpu <pystencils.codegen.config.GpuOptions>` +category of the `CreateKernelConfig`. + +### Adapting the Launch Configuration + +GPU kernel invocations usually require the user to provide a launch grid, defined +by the GPU thread block size and the number of blocks on the grid. +In the simplest case (seen above), pystencils-sfg will emit code that automatically +computes these parameters from the size of the arrays passed to the kernel, +using a default block size defined by pystencils. + +The code generator also permits customization of the launch configuration. +You may provide a custom block size to override the default, in which case the +grid size will still be computed by dividing the array sizes by your block size. +Alternatively, you can take full control of both block and grid size. +Instructions for both cases are given below. + +#### User-Defined Block Size for Auto-Computed Grid Size + +To merely modify the block size argument while still automatically inferring the grid size, +pass a variable or expression of type `dim3` to the `block_size` parameter of `gpu_invoke`.
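+The grid size is then inferred from the field sizes by a ceiling division with
+the block size in each dimension. As a minimal Python sketch of that arithmetic
+(illustrative only; the actual computation is emitted as C++ by the generator):
+
+```python
+def div_ceil(work_items: int, block_size: int) -> int:
+    # Mirrors the generated expression (work_items + block_size - 1) / block_size
+    return (work_items + block_size - 1) // block_size
+
+# e.g. 132 work items along a dimension with block size 64 require 3 blocks
+assert div_ceil(132, 64) == 3
+```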
+Pystencils-sfg exposes two versions of `dim3`, which differ primarily in their associated +runtime headers: + + - {any}`pystencilssfg.lang.gpu.cuda.dim3 <CudaAPI.dim3>` for CUDA, and + - {any}`pystencilssfg.lang.gpu.hip.dim3 <HipAPI.dim3>` for HIP. + +The following snippet selects the correct `dim3` type according to the kernel target; +it then creates a variable of that type and passes it as an argument to the kernel invocation: + +```{code-cell} ipython3 +:tags: [remove-cell] +target = ps.Target.HIP +cfg = ps.CreateKernelConfig(target=target) +f, g = ps.fields("f, g: double[128, 128]") +asm = ps.Assignment(f(0), g(0)) +``` + +```{code-cell} ipython3 +with SourceFileGenerator(sfg_config) as sfg: + # ... define kernel ... + khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + # Select dim3 reflection + match target: + case ps.Target.CUDA: + from pystencilssfg.lang.gpu import cuda as gpu_api + case ps.Target.HIP: + from pystencilssfg.lang.gpu import hip as gpu_api + + # Create dim3 variable and pass it to kernel invocation + block_size = gpu_api.dim3(const=True).var("block_size") + + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle, block_size=block_size) + ) +``` + +#### Manual Launch Configurations + +To take full control of the launch configuration, we must disable its automatic inference +by setting the {any}`gpu.manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>` +code generator option to `True`. +Then, we must pass `dim3` arguments for both `block_size` and `grid_size` to the kernel invocation: + +```{code-cell} ipython3 +from pystencilssfg.lang.gpu import cuda + +with SourceFileGenerator(sfg_config) as sfg: + # ... define kernel ... + + # Configure for manual launch config + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + cfg.gpu.manual_launch_grid = True + + # Register kernel + khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + # Create dim3 variables + block_size = cuda.dim3(const=True).var("block_size") + grid_size = cuda.dim3(const=True).var("grid_size") + + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle, block_size=block_size, grid_size=grid_size) + ) +``` + +### Using Streams + +CUDA and HIP kernels can be enqueued into streams for concurrent execution. +This is mirrored in pystencils-sfg; +all overloads of `gpu_invoke` take an optional `stream` argument. +The `stream_t` data types of both CUDA and HIP are made available +through the respective API reflections: + + - {any}`lang.gpu.cuda.stream_t <CudaAPI.stream_t>` reflects `cudaStream_t`, and + - {any}`lang.gpu.hip.stream_t <HipAPI.stream_t>` reflects `hipStream_t`. + +Here is an example that creates a variable of the HIP stream type +and passes it to `gpu_invoke`: + +```{code-cell} ipython3 +:tags: [remove-cell] +cfg = ps.CreateKernelConfig(target=ps.Target.HIP) +f, g = ps.fields("f, g: double[128, 128]") +asm = ps.Assignment(f(0), g(0)) +``` + +```{code-cell} ipython3 +from pystencilssfg.lang.gpu import hip + +with SourceFileGenerator(sfg_config) as sfg: + # ... define kernel ...
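+
+    # Register the kernel; the stream variable declared below becomes a
+    # parameter of the generated wrapper function, so the calling code
+    # creates and synchronizes the stream itself.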
+    khandle = sfg.kernels.create(asm, "gpu_kernel", cfg) + + stream = hip.stream_t(const=True).var("stream") + + sfg.function("kernel_wrapper")( + sfg.gpu_invoke(khandle, stream=stream) + ) +``` :::{admonition} To Do - - Creating and calling kernels - - Invoking GPU kernels and the CUDA API Mirror - Defining classes, their fields, constructors, and methods ::: diff --git a/src/pystencilssfg/composer/__init__.py b/src/pystencilssfg/composer/__init__.py index f6af76b8b9c36445990fc451983fec5c14a4cf34..c8f279ecd43c9e7809e8f7796c5ad4ad36ba7a76 100644 --- a/src/pystencilssfg/composer/__init__.py +++ b/src/pystencilssfg/composer/__init__.py @@ -9,6 +9,7 @@ from .basic_composer import ( ) from .mixin import SfgComposerMixIn from .class_composer import SfgClassComposer +from .gpu_composer import SfgGpuComposer __all__ = [ "SfgIComposer", @@ -20,4 +21,5 @@ __all__ = [ "ExprLike", "SfgBasicComposer", "SfgClassComposer", + "SfgGpuComposer", ] diff --git a/src/pystencilssfg/composer/basic_composer.py b/src/pystencilssfg/composer/basic_composer.py index 31337a6282932420f8d5b6d9093deec5c2caea1a..d78e43deb53345aceca56baa6cf63f07b3a1d8de 100644 --- a/src/pystencilssfg/composer/basic_composer.py +++ b/src/pystencilssfg/composer/basic_composer.py @@ -13,7 +13,7 @@ from pystencils import ( Assignment, AssignmentCollection, ) -from pystencils.codegen import Kernel +from pystencils.codegen import Kernel, Lambda from pystencils.types import create_type, UserTypeSpec, PsType from ..context import SfgContext, SfgCursor @@ -21,7 +21,6 @@ from .custom import CustomGenerator from ..ir import ( SfgCallTreeNode, SfgKernelCallNode, - SfgCudaKernelInvocation, SfgStatements, SfgFunctionParams, SfgRequireIncludes, @@ -53,6 +52,7 @@ from ..lang import ( HeaderFile, includes, SfgVar, + SfgKernelParamVar, AugExpr, SupportsFieldExtraction, SupportsVectorExtraction, @@ -390,34 +390,14 @@ class SfgBasicComposer(SfgIComposer): """Use inside a function body to directly call a kernel. When using `call`, the given kernel will simply be called as a function. - To invoke a GPU kernel on a specified launch grid, use `cuda_invoke` - or the interfaces of ``pystencilssfg.extensions.sycl`` instead. + To invoke a GPU kernel on a specified launch grid, + use `gpu_invoke <SfgGpuComposer.gpu_invoke>` instead. Args: kernel_handle: Handle to a kernel previously added to some kernel namespace. """ return SfgKernelCallNode(kernel_handle) - def cuda_invoke( - self, - kernel_handle: SfgKernelHandle, - num_blocks: ExprLike, - threads_per_block: ExprLike, - stream: ExprLike | None, - ): - """Dispatch a CUDA kernel to the device.""" - num_blocks_str = str(num_blocks) - tpb_str = str(threads_per_block) - stream_str = str(stream) if stream is not None else None - - deps = depends(num_blocks) | depends(threads_per_block) - if stream is not None: - deps |= depends(stream) - - return SfgCudaKernelInvocation( - kernel_handle, num_blocks_str, tpb_str, stream_str, deps - ) - def seq(self, *args: tuple | str | SfgCallTreeNode | SfgNodeBuilder) -> SfgSequence: """Syntax sequencing. For details, see `make_sequence`""" return make_sequence(*args) @@ -511,6 +491,11 @@ class SfgBasicComposer(SfgIComposer): """ return AugExpr.format(fmt, *deps, **kwdeps) + def expr_from_lambda(self, lamb: Lambda) -> AugExpr: + """Create an `AugExpr` from a pystencils `Lambda`, tracking the lambda's parameters as dependencies.""" + depends = set(SfgKernelParamVar(p) for p in lamb.parameters) + code = lamb.c_code() + return AugExpr.make(code, depends, dtype=lamb.return_type) + @property def branch(self) -> SfgBranchBuilder: """Use inside a function body to create an if/else conditional branch.
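The new `expr_from_lambda` helper wraps a pystencils `Lambda` (for instance, one entry of an automatically computed launch grid) into an `AugExpr` whose code is the lambda's C code and whose dependencies are the lambda's parameters. A minimal usage sketch, assuming `lamb` is a `pystencils.codegen.Lambda` obtained from a kernel's launch configuration:

```python
# `lamb` is assumed to be a pystencils.codegen.Lambda, e.g. one entry of an
# AutomaticLaunchConfiguration's grid size (see gpu_composer.py below).
expr = sfg.expr_from_lambda(lamb)  # AugExpr carrying lamb.c_code()
# The lambda's parameters are tracked as SfgKernelParamVar dependencies, so
# the postprocessing pass resolves them like ordinary kernel parameters.
```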
@@ -564,7 +549,11 @@ class SfgBasicComposer(SfgIComposer): var: SfgVar | sp.Symbol = asvar(param) if isinstance(param, _VarLike) else param return SfgDeferredParamSetter(var, expr) - def map_vector(self, lhs_components: Sequence[VarLike | sp.Symbol], rhs: SupportsVectorExtraction): + def map_vector( + self, + lhs_components: Sequence[VarLike | sp.Symbol], + rhs: SupportsVectorExtraction, + ): """Extracts scalar numerical values from a vector data type. Args: diff --git a/src/pystencilssfg/composer/composer.py b/src/pystencilssfg/composer/composer.py index bba479e3289f2b589a1d32e61997bdaa915eb47e..b1cfc4b1d5cbbe704f3fa1b7d7082be379e379b5 100644 --- a/src/pystencilssfg/composer/composer.py +++ b/src/pystencilssfg/composer/composer.py @@ -3,12 +3,13 @@ from typing import TYPE_CHECKING from .basic_composer import SfgBasicComposer from .class_composer import SfgClassComposer +from .gpu_composer import SfgGpuComposer if TYPE_CHECKING: from ..context import SfgContext -class SfgComposer(SfgBasicComposer, SfgClassComposer): +class SfgComposer(SfgBasicComposer, SfgClassComposer, SfgGpuComposer): """Primary interface for constructing source files in pystencils-sfg. The SfgComposer combines the `SfgBasicComposer` @@ -19,3 +20,4 @@ class SfgComposer(SfgBasicComposer, SfgClassComposer): def __init__(self, sfg: SfgContext | SfgBasicComposer): SfgBasicComposer.__init__(self, sfg) SfgClassComposer.__init__(self) + SfgGpuComposer.__init__(self) diff --git a/src/pystencilssfg/composer/gpu_composer.py b/src/pystencilssfg/composer/gpu_composer.py new file mode 100644 index 0000000000000000000000000000000000000000..aaffb02820baabd93aead08db30a35b57851f3fd --- /dev/null +++ b/src/pystencilssfg/composer/gpu_composer.py @@ -0,0 +1,307 @@ +from __future__ import annotations + +from typing import overload + +from pystencils.codegen import GpuKernel, Target +from pystencils.codegen.gpu_indexing import ( + ManualLaunchConfiguration, + AutomaticLaunchConfiguration, + DynamicBlockSizeLaunchConfiguration, +) + +from .mixin import SfgComposerMixIn +from .basic_composer import make_statements, make_sequence + +from ..context import SfgContext +from ..ir import ( + SfgKernelHandle, + SfgCallTreeNode, + SfgGpuKernelInvocation, + SfgBlock, + SfgSequence, +) +from ..lang import ExprLike, AugExpr +from ..lang.gpu import CudaAPI, HipAPI, ProvidesGpuRuntimeAPI + + +class SfgGpuComposer(SfgComposerMixIn): + """Composer mix-in providing methods to generate GPU kernel invocations. + + .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, **kwargs) + + Invoke a GPU kernel with launch configuration parameters depending on its code generator configuration. + + The overloads of this method are listed below. + They all (partially) mirror the CUDA and HIP ``kernel<<< Gs, Bs, Sm, St >>>()`` syntax; + for details on the launch configuration arguments, + refer to `Launch Configurations in CUDA`_ + or `Launch Configurations in HIP`_. + + .. function:: gpu_invoke(kernel_handle: SfgKernelHandle, *, grid_size: ExprLike, block_size: ExprLike, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode + :noindex: + + Invoke a GPU kernel with a manual launch grid. + + Requires that the kernel was generated + with `manual_launch_grid <pystencils.codegen.config.GpuOptions.manual_launch_grid>` + set to `True`. + + .. 
function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode + :noindex: + + Invoke a GPU kernel with an automatic launch grid. + + This signature accepts kernels generated with an indexing scheme that + causes the launch grid to be determined automatically, + such as `Blockwise4D <pystencils.codegen.config.GpuIndexingScheme.Blockwise4D>`. + + .. function:: gpu_invoke(self, kernel_handle: SfgKernelHandle, *, block_size: ExprLike | None = None, shared_memory_bytes: ExprLike = "0", stream: ExprLike | None = None, ) -> SfgCallTreeNode + :noindex: + + Invoke a GPU kernel with a dynamic launch grid. + + This signature accepts kernels generated with an indexing scheme that permits a user-defined + block size, such as `Linear3D <pystencils.codegen.config.GpuIndexingScheme.Linear3D>`. + The grid size is calculated automatically by dividing the number of work items in each + dimension by the block size, rounding up. + + .. _Launch Configurations in CUDA: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration + + .. _Launch Configurations in HIP: https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions + """ # NOQA: E501 + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + grid_size: ExprLike, + block_size: ExprLike, + shared_memory_bytes: ExprLike = "0", + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: ... + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + shared_memory_bytes: ExprLike = "0", + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: ... + + @overload + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + *, + block_size: ExprLike | None = None, + shared_memory_bytes: ExprLike = "0", + stream: ExprLike | None = None, + ) -> SfgCallTreeNode: ... + + def gpu_invoke( + self, + kernel_handle: SfgKernelHandle, + shared_memory_bytes: ExprLike = "0", + stream: ExprLike | None = None, + **kwargs, + ) -> SfgCallTreeNode: + builder = GpuInvocationBuilder(self._ctx, kernel_handle) + builder.shared_memory_bytes = shared_memory_bytes + builder.stream = stream + + return builder(**kwargs) + + def cuda_invoke( + self, + kernel_handle: SfgKernelHandle, + num_blocks: ExprLike, + threads_per_block: ExprLike, + stream: ExprLike | None, + ): + from warnings import warn + + warn( + "cuda_invoke is deprecated and will be removed before version 0.1. 
" + "Use `gpu_invoke` instead.", + FutureWarning, + ) + + return self.gpu_invoke( + kernel_handle, + grid_size=num_blocks, + block_size=threads_per_block, + stream=stream, + ) + + +class GpuInvocationBuilder: + def __init__( + self, + ctx: SfgContext, + kernel_handle: SfgKernelHandle, + ): + self._ctx = ctx + self._kernel_handle = kernel_handle + + ker = kernel_handle.kernel + + if not isinstance(ker, GpuKernel): + raise ValueError(f"Non-GPU kernel was passed to `gpu_invoke`: {ker}") + + launch_config = ker.get_launch_configuration() + + self._launch_config = launch_config + + gpu_api: type[ProvidesGpuRuntimeAPI] + match ker.target: + case Target.CUDA: + gpu_api = CudaAPI + case Target.HIP: + gpu_api = HipAPI + case _: + assert False, "unexpected GPU target" + + self._gpu_api = gpu_api + self._dim3 = gpu_api.dim3 + + self._shared_memory_bytes: ExprLike = "0" + self._stream: ExprLike | None = None + + @property + def shared_memory_bytes(self) -> ExprLike: + return self._shared_memory_bytes + + @shared_memory_bytes.setter + def shared_memory_bytes(self, bs: ExprLike): + self._shared_memory_bytes = bs + + @property + def stream(self) -> ExprLike | None: + return self._stream + + @stream.setter + def stream(self, s: ExprLike | None): + self._stream = s + + def _render_invocation( + self, grid_size: ExprLike, block_size: ExprLike + ) -> SfgSequence: + stmt_grid_size = make_statements(grid_size) + stmt_block_size = make_statements(block_size) + stmt_smem = make_statements(self._shared_memory_bytes) + stmt_stream = ( + make_statements(self._stream) if self._stream is not None else None + ) + + return make_sequence( + "// clang-format off: " + "[pystencils-sfg] Formatting may add illegal spaces between angular brackets in `<<< >>>`.", + SfgGpuKernelInvocation( + self._kernel_handle, + stmt_grid_size, + stmt_block_size, + shared_memory_bytes=stmt_smem, + stream=stmt_stream, + ), + "// clang-format on", + ) + + def __call__(self, **kwargs: ExprLike) -> SfgCallTreeNode: + match self._launch_config: + case ManualLaunchConfiguration(): + return self._invoke_manual(**kwargs) + case AutomaticLaunchConfiguration(): + return self._invoke_automatic(**kwargs) + case DynamicBlockSizeLaunchConfiguration(): + return self._invoke_dynamic(**kwargs) + case _: + raise ValueError( + f"Unexpected launch configuration: {self._launch_config}" + ) + + def _invoke_manual(self, grid_size: ExprLike, block_size: ExprLike): + assert isinstance(self._launch_config, ManualLaunchConfiguration) + return self._render_invocation(grid_size, block_size) + + def _invoke_automatic(self): + assert isinstance(self._launch_config, AutomaticLaunchConfiguration) + + from .composer import SfgComposer + + sfg = SfgComposer(self._ctx) + + grid_size_entries = [ + self._to_uint32_t(sfg.expr_from_lambda(gs)) + for gs in self._launch_config._grid_size + ] + grid_size_var = self._dim3(const=True).var("__grid_size") + + block_size_entries = [ + self._to_uint32_t(sfg.expr_from_lambda(bs)) + for bs in self._launch_config._block_size + ] + block_size_var = self._dim3(const=True).var("__block_size") + + nodes = [ + sfg.init(grid_size_var)(*grid_size_entries), + sfg.init(block_size_var)(*block_size_entries), + self._render_invocation(grid_size_var, block_size_var), + ] + + return SfgBlock(SfgSequence(nodes)) + + def _invoke_dynamic(self, block_size: ExprLike | None = None): + assert isinstance(self._launch_config, DynamicBlockSizeLaunchConfiguration) + + from .composer import SfgComposer + + sfg = SfgComposer(self._ctx) + + block_size_init_args: 
tuple[ExprLike, ...] + if block_size is None: + block_size_init_args = tuple( + str(bs) for bs in self._launch_config.default_block_size + ) + else: + block_size_init_args = (block_size,) + + block_size_var = self._dim3(const=True).var("__block_size") + + from ..lang.cpp import std + + work_items_entries = [ + sfg.expr_from_lambda(wit) for wit in self._launch_config.num_work_items + ] + work_items_var = std.tuple("uint32_t", "uint32_t", "uint32_t", const=True).var( + "__work_items" + ) + + def _div_ceil(a: ExprLike, b: ExprLike): + return AugExpr.format("({a} + {b} - 1) / {b}", a=a, b=b) + + grid_size_entries = [ + _div_ceil(work_items_var.get(i), bs) + for i, bs in enumerate( + [ + block_size_var.x, + block_size_var.y, + block_size_var.z, + ] + ) + ] + grid_size_var = self._dim3(const=True).var("__grid_size") + + nodes = [ + sfg.init(block_size_var)(*block_size_init_args), + sfg.init(work_items_var)(*work_items_entries), + sfg.init(grid_size_var)(*grid_size_entries), + self._render_invocation(grid_size_var, block_size_var), + ] + + return SfgBlock(SfgSequence(nodes)) + + @staticmethod + def _to_uint32_t(expr: AugExpr) -> AugExpr: + return AugExpr("uint32_t").format("uint32_t({})", expr) diff --git a/src/pystencilssfg/context.py b/src/pystencilssfg/context.py index 1622a1e3bd33259cd89eef171ad043c4ea9ef536..5773455198187fef7d670ca1f4f9232fb1d7adde 100644 --- a/src/pystencilssfg/context.py +++ b/src/pystencilssfg/context.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import Sequence, Any, Generator from contextlib import contextmanager -from .config import CodeStyle +from .config import CodeStyle, ClangFormatOptions from .ir import ( SfgSourceFile, SfgNamespace, @@ -23,6 +23,7 @@ class SfgContext: impl_file: SfgSourceFile | None, namespace: str | None = None, codestyle: CodeStyle | None = None, + clang_format_opts: ClangFormatOptions | None = None, argv: Sequence[str] | None = None, project_info: Any = None, ): @@ -33,6 +34,9 @@ class SfgContext: self._inner_namespace: str | None = None self._codestyle = codestyle if codestyle is not None else CodeStyle() + self._clang_format: ClangFormatOptions = ( + clang_format_opts if clang_format_opts is not None else ClangFormatOptions() + ) self._header_file = header_file self._impl_file = impl_file @@ -73,6 +77,10 @@ class SfgContext: """The code style object for this generation context.""" return self._codestyle + @property + def clang_format(self) -> ClangFormatOptions: + return self._clang_format + @property def header_file(self) -> SfgSourceFile: return self._header_file @@ -150,6 +158,9 @@ class SfgCursor: self._loc[f].append(block) self._loc[f] = block.elements + outer_namespace = self._cur_namespace + self._cur_namespace = namespace + @contextmanager def ctxmgr(): try: @@ -157,5 +168,6 @@ class SfgCursor: finally: # Have the cursor step back out of the nested namespace blocks self._loc = outer_locs + self._cur_namespace = outer_namespace return ctxmgr() diff --git a/src/pystencilssfg/extensions/gpu.py b/src/pystencilssfg/extensions/gpu.py deleted file mode 100644 index b4242ac81aa166085dca20b4936cc56e1b829720..0000000000000000000000000000000000000000 --- a/src/pystencilssfg/extensions/gpu.py +++ /dev/null @@ -1,38 +0,0 @@ -from pystencilssfg import lang - - -def dim3class(gpu_runtime_header: str, *, cls_name: str = "dim3"): - """ - >>> dim3 = dim3class("<hip/hip_runtime.h>") - >>> dim3().ctor(64, 1, 1) - dim3{64, 1, 1} - - Args: - gpu_runtime_header: String with the name of the gpu runtime header - cls_name: String with the 
acutal name (default "dim3") - Returns: - Dim3Class: A `lang.CppClass` that mimics cuda's/hip's `dim3` - """ - @lang.cppclass(cls_name, gpu_runtime_header) - class Dim3Class: - def ctor(self, dim0=1, dim1=1, dim2=1): - return self.ctor_bind(dim0, dim1, dim2) - - @property - def x(self): - return lang.AugExpr.format("{}.x", self) - - @property - def y(self): - return lang.AugExpr.format("{}.y", self) - - @property - def z(self): - return lang.AugExpr.format("{}.z", self) - - @property - def dims(self): - """The dims property.""" - return [self.x, self.y, self.z] - - return Dim3Class diff --git a/src/pystencilssfg/generator.py b/src/pystencilssfg/generator.py index c314d67bbc45d5da639d3b7a2b2f92667cc77586..fe4eb99d4cf15056e457fd14a7976d54bb0db6ae 100644 --- a/src/pystencilssfg/generator.py +++ b/src/pystencilssfg/generator.py @@ -95,9 +95,7 @@ class SourceFileGenerator: self._impl_file = SfgSourceFile( output_files[1].name, SfgSourceFileType.TRANSLATION_UNIT ) - self._impl_file.includes.append( - HeaderFile.parse(self._header_file.name) - ) + self._impl_file.includes.append(HeaderFile.parse(self._header_file.name)) # TODO: Find a way to not hard-code the restrict qualifier in pystencils self._header_file.elements.append("#define RESTRICT __restrict__") @@ -115,14 +113,11 @@ class SourceFileGenerator: self._impl_file, namespace, config.codestyle, + config.clang_format, argv=script_args, project_info=cli_params.get_project_info(), ) - self._emitter = SfgCodeEmitter( - self._output_dir, config.codestyle, config.clang_format - ) - sort_key = config.codestyle.get_option("includes_sorting_key") if sort_key is None: @@ -161,6 +156,13 @@ class SourceFileGenerator: ) self._impl_file.includes.sort(key=self._include_sort_key) + def _get_emitter(self): + return SfgCodeEmitter( + self._output_dir, + self._context.codestyle, + self._context.clang_format, + ) + def __enter__(self) -> SfgComposer: self.clean_files() return SfgComposer(self._context) @@ -169,6 +171,7 @@ class SourceFileGenerator: if exc_type is None: self._finish_files() - self._emitter.emit(self._header_file) + emitter = self._get_emitter() + emitter.emit(self._header_file) if self._impl_file is not None: - self._emitter.emit(self._impl_file) + emitter.emit(self._impl_file) diff --git a/src/pystencilssfg/ir/__init__.py b/src/pystencilssfg/ir/__init__.py index 8f03fed0d4c2467377cdaab6cf100a13f7ded9fb..0d93fb148333921d60f25cf68ace89f61873e383 100644 --- a/src/pystencilssfg/ir/__init__.py +++ b/src/pystencilssfg/ir/__init__.py @@ -3,7 +3,7 @@ from .call_tree import ( SfgCallTreeLeaf, SfgEmptyNode, SfgKernelCallNode, - SfgCudaKernelInvocation, + SfgGpuKernelInvocation, SfgBlock, SfgSequence, SfgStatements, @@ -47,7 +47,7 @@ __all__ = [ "SfgCallTreeLeaf", "SfgEmptyNode", "SfgKernelCallNode", - "SfgCudaKernelInvocation", + "SfgGpuKernelInvocation", "SfgSequence", "SfgBlock", "SfgStatements", diff --git a/src/pystencilssfg/ir/call_tree.py b/src/pystencilssfg/ir/call_tree.py index 24a315d5a0ae0319cbc1b906f98deacc72828176..61c5bbbe01cdd42ae7b94b7843aa1e916c3afedb 100644 --- a/src/pystencilssfg/ir/call_tree.py +++ b/src/pystencilssfg/ir/call_tree.py @@ -19,6 +19,7 @@ class SfgCallTreeNode(ABC): Therefore, every instantiable call tree node must implement the method `get_code`. By convention, the string returned by `get_code` should not contain a trailing newline. 
""" + def __init__(self) -> None: self._includes: set[HeaderFile] = set() @@ -34,6 +35,11 @@ class SfgCallTreeNode(ABC): By convention, the code block emitted by this function should not contain a trailing newline. """ + @property + def depends(self) -> set[SfgVar]: + """Set of objects this leaf depends on""" + return set() + @property def required_includes(self) -> set[HeaderFile]: """Return a set of header includes required by this node""" @@ -53,11 +59,6 @@ class SfgCallTreeLeaf(SfgCallTreeNode, ABC): def children(self) -> Sequence[SfgCallTreeNode]: return () - @property - @abstractmethod - def depends(self) -> set[SfgVar]: - """Set of objects this leaf depends on""" - class SfgEmptyNode(SfgCallTreeLeaf): """A leaf node that does not emit any code. @@ -202,52 +203,76 @@ class SfgKernelCallNode(SfgCallTreeLeaf): return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) return f"{fnc_name}({call_parameters});" -class SfgCudaKernelInvocation(SfgCallTreeLeaf): +class SfgGpuKernelInvocation(SfgCallTreeNode): + """A CUDA or HIP kernel invocation. + + See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#execution-configuration + or https://rocmdocs.amd.com/projects/HIP/en/latest/how-to/hip_cpp_language_extensions.html#calling-global-functions + for the syntax. + """ + def __init__( self, kernel_handle: SfgKernelHandle, - num_blocks_code: str, - threads_per_block_code: str, - stream_code: str | None, - depends: set[SfgVar], + grid_size: SfgStatements, + block_size: SfgStatements, + shared_memory_bytes: SfgStatements | None, + stream: SfgStatements | None, ): - from pystencils import Target from pystencils.codegen import GpuKernel kernel = kernel_handle.kernel - if not (isinstance(kernel, GpuKernel) and kernel.target == Target.CUDA): + if not isinstance(kernel, GpuKernel): raise ValueError( - "An `SfgCudaKernelInvocation` node can only call a CUDA kernel." + "An `SfgGpuKernelInvocation` node can only call GPU kernels." 
) super().__init__() self._kernel_handle = kernel_handle - self._num_blocks = num_blocks_code - self._threads_per_block = threads_per_block_code - self._stream = stream_code - self._depends = depends + self._grid_size = grid_size + self._block_size = block_size + self._shared_memory_bytes = shared_memory_bytes + self._stream = stream + + @property + def children(self) -> Sequence[SfgCallTreeNode]: + return ( + ( + self._grid_size, + self._block_size, + ) + + ( + (self._shared_memory_bytes,) + if self._shared_memory_bytes is not None + else () + ) + + ((self._stream,) if self._stream is not None else ()) + ) @property def depends(self) -> set[SfgVar]: - return set(self._kernel_handle.parameters) | self._depends + return set(self._kernel_handle.parameters) def get_code(self, cstyle: CodeStyle) -> str: - ast_params = self._kernel_handle.parameters + kparams = self._kernel_handle.parameters fnc_name = self._kernel_handle.fqname - call_parameters = ", ".join([p.name for p in ast_params]) + call_parameters = ", ".join([p.name for p in kparams]) + + grid_args = [self._grid_size, self._block_size] + if self._shared_memory_bytes is not None: + grid_args += [self._shared_memory_bytes] - grid_args = [self._num_blocks, self._threads_per_block] if self._stream is not None: grid_args += [self._stream] - grid = "<<< " + ", ".join(grid_args) + " >>>" + grid = "<<< " + ", ".join(arg.get_code(cstyle) for arg in grid_args) + " >>>" return f"{fnc_name}{grid}({call_parameters});" diff --git a/src/pystencilssfg/ir/postprocessing.py b/src/pystencilssfg/ir/postprocessing.py index 1e692b0aa9f37368da1688ec2d3bac6892c5ac60..896693317c02dee67302221f810d64c01b5eb233 100644 --- a/src/pystencilssfg/ir/postprocessing.py +++ b/src/pystencilssfg/ir/postprocessing.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import Sequence, Iterable import warnings -from functools import reduce from dataclasses import dataclass from abc import ABC, abstractmethod @@ -15,7 +14,7 @@ from pystencils.codegen.properties import FieldBasePtr, FieldShape, FieldStride from ..exceptions import SfgException from ..config import CodeStyle -from .call_tree import SfgCallTreeNode, SfgCallTreeLeaf, SfgSequence, SfgStatements +from .call_tree import SfgCallTreeNode, SfgSequence, SfgStatements from ..lang.expressions import SfgKernelParamVar from ..lang import ( SfgVar, @@ -163,17 +162,12 @@ class CallTreePostProcessing: self.handle_sequence(node, ppc) return ppc.live_variables - case SfgCallTreeLeaf(): - return node.depends - case SfgDeferredNode(): raise SfgException("Deferred nodes can only occur inside a sequence.") case _: - return reduce( - lambda x, y: x | y, - (self.get_live_variables(c) for c in node.children), - set(), + return node.depends.union( + *(self.get_live_variables(c) for c in node.children) ) diff --git a/src/pystencilssfg/lang/cpp/std_tuple.py b/src/pystencilssfg/lang/cpp/std_tuple.py index 645b6b56fbeb515d6324feafdb8588f7ca22e992..6d1e1c0da0987be055006ce696779a1e2ebaa56b 100644 --- a/src/pystencilssfg/lang/cpp/std_tuple.py +++ b/src/pystencilssfg/lang/cpp/std_tuple.py @@ -19,10 +19,13 @@ class StdTuple(AugExpr, SupportsVectorExtraction): dtype = self._template(ts=", ".join(elt_type_strings), const=const, ref=ref) super().__init__(dtype) + def get(self, idx: int | str) -> AugExpr: + return AugExpr.format("std::get< {} >({})", idx, self) + def _extract_component(self, coordinate: int) -> AugExpr: if coordinate < 0 or coordinate >= self._length: raise ValueError( f"Index {coordinate} out-of-bounds for std::tuple 
with {self._length} entries." ) - return AugExpr.format("std::get< {} >({})", coordinate, self) + return self.get(coordinate) diff --git a/src/pystencilssfg/lang/expressions.py b/src/pystencilssfg/lang/expressions.py index 135a54eed92e4ba214244c8f46323ea81f6610db..8be59b0b37d72113814fa1489a44e7816f0e6f75 100644 --- a/src/pystencilssfg/lang/expressions.py +++ b/src/pystencilssfg/lang/expressions.py @@ -218,8 +218,12 @@ class AugExpr: return self._bind(expr) @staticmethod - def make(code: str, depends: Iterable[SfgVar | AugExpr]): - return AugExpr()._bind(DependentExpression(code, depends)) + def make( + code: str, + depends: Iterable[SfgVar | AugExpr], + dtype: UserTypeSpec | None = None, + ): + return AugExpr(dtype)._bind(DependentExpression(code, depends)) @staticmethod def format(fmt: str, *deps, **kwdeps) -> AugExpr: diff --git a/src/pystencilssfg/lang/gpu.py b/src/pystencilssfg/lang/gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b55161eb536cc1dd6dcba2c0bd08827c91dc01 --- /dev/null +++ b/src/pystencilssfg/lang/gpu.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from typing import Protocol + +from .expressions import CppClass, cpptype, AugExpr + + +class Dim3Interface(CppClass): + """Interface definition for the ``dim3`` struct of CUDA and HIP.""" + + def ctor(self, dim0=1, dim1=1, dim2=1): + """Constructor invocation of ``dim3``""" + return self.ctor_bind(dim0, dim1, dim2) + + @property + def x(self) -> AugExpr: + """The `x` coordinate member.""" + return AugExpr.format("{}.x", self) + + @property + def y(self) -> AugExpr: + """The `y` coordinate member.""" + return AugExpr.format("{}.y", self) + + @property + def z(self) -> AugExpr: + """The `z` coordinate member.""" + return AugExpr.format("{}.z", self) + + @property + def dims(self) -> tuple[AugExpr, AugExpr, AugExpr]: + """`x`, `y`, and `z` as a tuple.""" + return (self.x, self.y, self.z) + + +class ProvidesGpuRuntimeAPI(Protocol): + """Protocol definition for a GPU runtime API provider.""" + + dim3: type[Dim3Interface] + """The ``dim3`` struct type for this GPU runtime""" + + stream_t: type[AugExpr] + """The ``stream_t`` type for this GPU runtime""" + + +class CudaAPI(ProvidesGpuRuntimeAPI): + """Reflection of the CUDA runtime API""" + + class dim3(Dim3Interface): + """Implements `Dim3Interface` for CUDA""" + + template = cpptype("dim3", "<cuda_runtime.h>") + + class stream_t(CppClass): + template = cpptype("cudaStream_t", "<cuda_runtime.h>") + + +cuda = CudaAPI +"""Alias for `CudaAPI`""" + + +class HipAPI(ProvidesGpuRuntimeAPI): + """Reflection of the HIP runtime API""" + + class dim3(Dim3Interface): + """Implements `Dim3Interface` for HIP""" + + template = cpptype("dim3", "<hip/hip_runtime.h>") + + class stream_t(CppClass): + template = cpptype("hipStream_t", "<hip/hip_runtime.h>") + + +hip = HipAPI +"""Alias for `HipAPI`""" diff --git a/tests/extensions/test_gpu.py b/tests/extensions/test_gpu.py deleted file mode 100644 index 2e8d133e80d0b8f9487e39290f4528aa1f11533e..0000000000000000000000000000000000000000 --- a/tests/extensions/test_gpu.py +++ /dev/null @@ -1,26 +0,0 @@ -from pystencilssfg.extensions.gpu import dim3class -from pystencilssfg.lang import HeaderFile, AugExpr - - -def test_dim3(): - cuda_runtime = "<cuda_runtime.h>" - dim3 = dim3class(cuda_runtime, cls_name="dim3") - assert HeaderFile.parse(cuda_runtime) in dim3.template.includes - assert str(dim3().ctor(128, 1, 1)) == "dim3{128, 1, 1}" - assert str(dim3().ctor()) == "dim3{1, 1, 1}" - assert str(dim3().ctor(1, 1, 128)) == 
"dim3{1, 1, 128}" - - block = dim3(ref=True, const=True).var("block") - - dims = [ - AugExpr.format( - "uint32_t(({} + {} - 1)/ {})", - 1024, - block.dims[i], - block.dims[i], - ) - for i in range(3) - ] - - grid = dim3().ctor(*dims) - assert str(grid) == f"dim3{{{', '.join((str(d) for d in dims))}}}" diff --git a/tests/generator_scripts/index.yaml b/tests/generator_scripts/index.yaml index 1c97aaf6a34b0171f4829c56da0f345934c50176..c87977f3761739985b65e64e5b069b0359efdce0 100644 --- a/tests/generator_scripts/index.yaml +++ b/tests/generator_scripts/index.yaml @@ -90,6 +90,33 @@ StlContainers1D: MdSpanFixedShapeLayouts: MdSpanLbStreaming: +# CUDA + +CudaKernels: + sfg-args: + file-extensions: ["hpp", "cu"] + compile: + cxx: nvcc + cxx-flags: + - -std=c++20 + - -Werror + - all-warnings + - --expt-relaxed-constexpr + skip-if-not-found: true + +# HIP + +HipKernels: + sfg-args: + file-extensions: ["hpp", "hip"] + compile: + cxx: hipcc + cxx-flags: + - -std=c++20 + - -Wall + - -Werror + skip-if-not-found: true + # SYCL SyclKernels: diff --git a/tests/generator_scripts/source/CudaKernels.harness.cpp b/tests/generator_scripts/source/CudaKernels.harness.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7d34d5ab99a7bf2c913b20f4cdba88371f38f7e --- /dev/null +++ b/tests/generator_scripts/source/CudaKernels.harness.cpp @@ -0,0 +1,116 @@ +#include "CudaKernels.hpp" + +#include <cuda_runtime.h> + +#include <experimental/mdspan> +#include <random> +#include <iostream> +#include <functional> + +#undef NDEBUG +#include <cassert> + +namespace stdex = std::experimental; + +using extents_t = stdex::dextents<uint64_t, 3>; +using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>; + +void checkCudaError(cudaError_t err) +{ + if (err != cudaSuccess) + { + std::cerr << "HIP Error: " << err << std::endl; + exit(2); + } +} + +int main(void) +{ + + extents_t extents{23, 25, 132}; + size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)}; + + double *data_src; + checkCudaError(cudaMallocManaged<double>(&data_src, sizeof(double) * items)); + field_t src{data_src, extents}; + + double *data_dst; + checkCudaError(cudaMallocManaged<double>(&data_dst, sizeof(double) * items)); + field_t dst{data_dst, extents}; + + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<double> distrib{-1.0, 1.0}; + + auto check = [&](std::function<void()> invoke) + { + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + data_dst[i] = NAN; + } + + invoke(); + + for (size_t i = 0; i < items; ++i) + { + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } + } + }; + + check([&]() + { + /* Linear3D Dynamic */ + dim3 blockSize{64, 8, 1}; + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d::scaleKernel(blockSize, dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); + + check([&]() + { + /* Linear3D Automatic */ + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d_automatic::scaleKernel(dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); + + check([&]() + { + /* Blockwise4D Automatic */ + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::blockwise4d::scaleKernel(dst, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); + + check([&]() 
+ { + /* Linear3D Manual */ + dim3 blockSize{32, 8, 1}; + dim3 gridSize{5, 4, 23}; + + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); + + check([&]() + { + /* Blockwise4D Manual */ + dim3 blockSize{132, 1, 1}; + dim3 gridSize{25, 23, 1}; + cudaStream_t stream; + checkCudaError(cudaStreamCreate(&stream)); + gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkCudaError(cudaStreamSynchronize(stream)); }); + + checkCudaError(cudaFree(data_src)); + checkCudaError(cudaFree(data_dst)); + + return EXIT_SUCCESS; +} diff --git a/tests/generator_scripts/source/CudaKernels.py b/tests/generator_scripts/source/CudaKernels.py new file mode 100644 index 0000000000000000000000000000000000000000..e019e4faf1915023000cc0e12bed770c07d66031 --- /dev/null +++ b/tests/generator_scripts/source/CudaKernels.py @@ -0,0 +1,102 @@ +from pystencilssfg import SourceFileGenerator +from pystencilssfg.lang.cpp import std +from pystencilssfg.lang.gpu import cuda + +import pystencils as ps + +std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + + +src, dst = ps.fields("src, dst: double[3D]", layout="c") +asm = ps.Assignment(dst(0), 2 * src(0)) + + +with SourceFileGenerator() as sfg: + sfg.namespace("gen") + + base_config = ps.CreateKernelConfig(target=ps.Target.CUDA) + + block_size = cuda.dim3().var("blockSize") + grid_size = cuda.dim3().var("gridSize") + stream = cuda.stream_t().var("stream") + + with sfg.namespace("linear3d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), + ) + + with sfg.namespace("linear3d_automatic"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("blockwise4d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("linear3d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), + ) + + with sfg.namespace("blockwise4d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + 
cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), + ) diff --git a/tests/generator_scripts/source/HipKernels.harness.cpp b/tests/generator_scripts/source/HipKernels.harness.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2bf7b83c1dae274c16b5f5e286e3edd6024280e5 --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.harness.cpp @@ -0,0 +1,116 @@ +#include "HipKernels.hpp" + +#include <hip/hip_runtime.h> + +#include <experimental/mdspan> +#include <random> +#include <iostream> +#include <functional> + +#undef NDEBUG +#include <cassert> + +namespace stdex = std::experimental; + +using extents_t = stdex::dextents<uint64_t, 3>; +using field_t = stdex::mdspan<double, extents_t, stdex::layout_right>; + +void checkHipError(hipError_t err) +{ + if (err != hipSuccess) + { + std::cerr << "HIP Error: " << err << std::endl; + exit(2); + } +} + +int main(void) +{ + + extents_t extents{23, 25, 132}; + size_t items{extents.extent(0) * extents.extent(1) * extents.extent(2)}; + + double *data_src; + checkHipError(hipMallocManaged<double>(&data_src, sizeof(double) * items)); + field_t src{data_src, extents}; + + double *data_dst; + checkHipError(hipMallocManaged<double>(&data_dst, sizeof(double) * items)); + field_t dst{data_dst, extents}; + + std::random_device rd; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<double> distrib{-1.0, 1.0}; + + auto check = [&](std::function<void()> invoke) + { + for (size_t i = 0; i < items; ++i) + { + data_src[i] = distrib(gen); + data_dst[i] = NAN; + } + + invoke(); + + for (size_t i = 0; i < items; ++i) + { + const double desired = 2.0 * data_src[i]; + if (std::abs(desired - data_dst[i]) >= 1e-12) + { + std::cerr << "Mismatch at element " << i << "; Desired: " << desired << "; Actual: " << data_dst[i] << std::endl; + exit(EXIT_FAILURE); + } + } + }; + + check([&]() + { + /* Linear3D Dynamic */ + dim3 blockSize{64, 8, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d::scaleKernel(blockSize, dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); + + check([&]() + { + /* Linear3D Automatic */ + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d_automatic::scaleKernel(dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); + + check([&]() + { + /* Blockwise4D Automatic */ + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d::scaleKernel(dst, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); + + check([&]() + { + /* Linear3D Manual */ + dim3 blockSize{32, 8, 1}; + dim3 gridSize{5, 4, 23}; + + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::linear3d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); + + check([&]() + { + /* Blockwise4D Manual */ + dim3 blockSize{132, 1, 1}; + dim3 gridSize{25, 23, 1}; + hipStream_t stream; + checkHipError(hipStreamCreate(&stream)); + gen::blockwise4d_manual::scaleKernel(blockSize, dst, gridSize, src, stream); + checkHipError(hipStreamSynchronize(stream)); }); + + checkHipError(hipFree(data_src)); + checkHipError(hipFree(data_dst)); + + 
return EXIT_SUCCESS; +} diff --git a/tests/generator_scripts/source/HipKernels.py b/tests/generator_scripts/source/HipKernels.py new file mode 100644 index 0000000000000000000000000000000000000000..20d9df5552ef595330a7d1af8486bd5b47ffee43 --- /dev/null +++ b/tests/generator_scripts/source/HipKernels.py @@ -0,0 +1,102 @@ +from pystencilssfg import SourceFileGenerator +from pystencilssfg.lang.cpp import std +from pystencilssfg.lang.gpu import hip + +import pystencils as ps + +std.mdspan.configure(namespace="std::experimental", header="<experimental/mdspan>") + + +src, dst = ps.fields("src, dst: double[3D]", layout="c") +asm = ps.Assignment(dst(0), 2 * src(0)) + + +with SourceFileGenerator() as sfg: + sfg.namespace("gen") + + base_config = ps.CreateKernelConfig(target=ps.Target.HIP) + + block_size = hip.dim3().var("blockSize") + grid_size = hip.dim3().var("gridSize") + stream = hip.stream_t().var("stream") + + with sfg.namespace("linear3d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, block_size=block_size, stream=stream), + ) + + with sfg.namespace("linear3d_automatic"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("blockwise4d"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke(khandle, stream=stream), + ) + + with sfg.namespace("linear3d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), + ) + + with sfg.namespace("blockwise4d_manual"): + cfg = base_config.copy() + cfg.gpu.indexing_scheme = "blockwise4d" + cfg.gpu.manual_launch_grid = True + khandle = sfg.kernels.create(asm, "scale", cfg) + + sfg.function("scaleKernel")( + sfg.map_field( + src, std.mdspan.from_field(src, ref=True, layout_policy="layout_right") + ), + sfg.map_field( + dst, std.mdspan.from_field(dst, ref=True, layout_policy="layout_right") + ), + sfg.gpu_invoke( + khandle, block_size=block_size, grid_size=grid_size, stream=stream + ), + ) diff --git a/tests/generator_scripts/test_generator_scripts.py b/tests/generator_scripts/test_generator_scripts.py index 6f2ff160e8b9ec2367020a4a3f91d4a071a71e39..8901c3444370e238d9c6fea60a73141eaffa1596 100644 --- a/tests/generator_scripts/test_generator_scripts.py +++ 
b/tests/generator_scripts/test_generator_scripts.py @@ -12,16 +12,24 @@ import shutil import warnings import subprocess +from pystencils.include import get_pystencils_include_path + THIS_DIR = pathlib.Path(__file__).parent DEPS_DIR = THIS_DIR / "deps" MDSPAN_QUAL_PATH = "mdspan-mdspan-0.6.0/include/" +PYSTENCILS_RT_INCLUDE_PATH = get_pystencils_include_path() TEST_INDEX = THIS_DIR / "index.yaml" SOURCE_DIR = THIS_DIR / "source" EXPECTED_DIR = THIS_DIR / "expected" -CXX_INCLUDE_FLAGS = ["-I", f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}"] +CXX_INCLUDE_FLAGS = [ + "-I", + f"{DEPS_DIR}/{MDSPAN_QUAL_PATH}", + "-I", + PYSTENCILS_RT_INCLUDE_PATH, +] def prepare_deps(): @@ -101,7 +109,7 @@ class GenScriptTest: for ext in self._expected_extensions: fname = f"{self._name}.{ext}" self._expected_files.add(fname) - if ext in ("cpp", "cxx", "c++"): + if ext in ("cpp", "cxx", "c++", "cu", "hip"): self._files_to_compile.append(fname) compile_descr: dict = test_description.get("compile", dict())
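With these changes, generator script tests may emit `.cu` and `.hip` files, which are compiled using the per-test compiler and flags from `index.yaml` together with the shared include flags above. A rough sketch of the flags assembled for the new CUDA case (paths are placeholders for values resolved at test time):

```python
# Sketch only: combines the CudaKernels entry of index.yaml with
# CXX_INCLUDE_FLAGS; "<deps>" and "<pystencils-include>" stand in for
# DEPS_DIR / MDSPAN_QUAL_PATH and PYSTENCILS_RT_INCLUDE_PATH.
nvcc_flags = [
    "-std=c++20",
    "-Werror", "all-warnings",
    "--expt-relaxed-constexpr",
    "-I", "<deps>/mdspan-mdspan-0.6.0/include/",
    "-I", "<pystencils-include>",
]
```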