From 81184635009a1e62d06e8adaa150fa84afd2c026 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 09:50:39 +0100 Subject: [PATCH 01/15] update pyproject.toml to distribute all headers in subfolders of `include` --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3c6b1c02..55b21cbbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,7 +89,8 @@ build-backend = "setuptools.build_meta" [tool.setuptools.package-data] pystencils = [ - "include/*.h", + "include/**/*.h", + "include/**/*.hpp", "jit/cpu/*.tmpl.cpp", "boundaries/createindexlistcython.pyx" ] -- GitLab From 71ab7480164ba7d4aeba69ecd9352898015fd7f9 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 10:27:06 +0100 Subject: [PATCH 02/15] start introduction of Target.HIP --- src/pystencils/backend/emission/base_printer.py | 2 +- src/pystencils/codegen/config.py | 8 +++++--- src/pystencils/codegen/driver.py | 4 ++-- src/pystencils/codegen/target.py | 7 +++++++ 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py index cc4b50e21..c76a347f0 100644 --- a/src/pystencils/backend/emission/base_printer.py +++ b/src/pystencils/backend/emission/base_printer.py @@ -383,7 +383,7 @@ class BasePrinter(ABC): from ...codegen import GpuKernel sig_parts = [self._func_prefix] if self._func_prefix is not None else [] - if isinstance(func, GpuKernel) and func.target == Target.CUDA: + if isinstance(func, GpuKernel) and func.target.is_gpu(): sig_parts.append("__global__") sig_parts += ["void", func.name, f"({params_str})"] signature = " ".join(sig_parts) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 0d43b40e3..821bb7e07 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -593,12 +593,14 @@ class CreateKernelConfig(ConfigBase): """Returns either the user-specified JIT compiler, or infers one from the target if none is given.""" jit: JitBase | None = self.get_option("jit") + target = self.get_target() + if jit is None: - if self.get_target().is_cpu(): + if target.is_cpu(): from ..jit import LegacyCpuJit return LegacyCpuJit() - elif self.get_target() == Target.CUDA: + elif target == Target.CUDA: try: from ..jit.gpu_cupy import CupyJit @@ -609,7 +611,7 @@ class CreateKernelConfig(ConfigBase): return no_jit - elif self.get_target() == Target.SYCL: + elif target == Target.SYCL or target == Target.HIP: from ..jit import no_jit return no_jit diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index b8f9c7101..f53f1b9b8 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -398,7 +398,7 @@ class DefaultKernelCreationDriver: return kernel_ast def _get_gpu_indexing(self) -> GpuIndexing | None: - if self._target != Target.CUDA: + if not self._target.is_gpu(): return None from .gpu_indexing import dim3 @@ -441,7 +441,7 @@ class DefaultKernelCreationDriver: omit_range_check: bool = gpu_opts.get_option("omit_range_check") match self._target: - case Target.CUDA: + case Target.CUDA | Target.HIP: from ..backend.platforms import CudaPlatform thread_mapping = ( diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index 0d724b877..03364af28 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -30,6 +30,7 @@ class Target(Flag): _GPU = auto() _CUDA = auto() + _HIP = auto() _SYCL = auto() @@ -86,6 +87,12 @@ class Target(Flag): Generate a CUDA kernel for a generic Nvidia GPU. """ + HIP = _GPU | _HIP + """Generic HIP GPU target. + + Generate a HIP kernel for generic AMD or NVidia GPUs. + """ + GPU = CUDA """Alias for `Target.CUDA`, for backward compatibility.""" -- GitLab From 0810cb8c225eddfb7f023c0a5882f6b5a397dc4e Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 12 Mar 2025 12:07:10 +0100 Subject: [PATCH 03/15] fix deprecation warning stacklevel. fix linting. --- src/pystencils/backend/emission/base_printer.py | 1 - src/pystencils/enums.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/emission/base_printer.py b/src/pystencils/backend/emission/base_printer.py index c76a347f0..c4ac0640c 100644 --- a/src/pystencils/backend/emission/base_printer.py +++ b/src/pystencils/backend/emission/base_printer.py @@ -57,7 +57,6 @@ from ..extensions.foreign_ast import PsForeignExpression from ..memory import PsSymbol from ..constants import PsConstant from ...types import PsType -from ...codegen import Target if TYPE_CHECKING: from ...codegen import Kernel diff --git a/src/pystencils/enums.py b/src/pystencils/enums.py index bcea50e84..9d6470ed7 100644 --- a/src/pystencils/enums.py +++ b/src/pystencils/enums.py @@ -5,7 +5,8 @@ from warnings import warn warn( "Importing anything from `pystencils.enums` is deprecated and the module will be removed in pystencils 2.1. " "Import from `pystencils` instead.", - FutureWarning + FutureWarning, + stacklevel=2, ) Target = _Target -- GitLab From 17d9fac8b5d76f8491203d30c6c4b09cf4c11790 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Thu, 13 Mar 2025 14:41:08 +0100 Subject: [PATCH 04/15] introduce HipPlatform --- src/pystencils/backend/platforms/__init__.py | 2 + src/pystencils/backend/platforms/cuda.py | 328 +----------------- .../backend/platforms/generic_gpu.py | 326 ++++++++++++++++- src/pystencils/backend/platforms/hip.py | 11 + src/pystencils/codegen/driver.py | 34 +- src/pystencils/codegen/gpu_indexing.py | 4 +- 6 files changed, 362 insertions(+), 343 deletions(-) create mode 100644 src/pystencils/backend/platforms/hip.py diff --git a/src/pystencils/backend/platforms/__init__.py b/src/pystencils/backend/platforms/__init__.py index 589841db8..3b602964b 100644 --- a/src/pystencils/backend/platforms/__init__.py +++ b/src/pystencils/backend/platforms/__init__.py @@ -2,6 +2,7 @@ from .platform import Platform from .generic_cpu import GenericCpu, GenericVectorCpu from .generic_gpu import GenericGpu from .cuda import CudaPlatform +from .hip import HipPlatform from .x86 import X86VectorCpu, X86VectorArch from .sycl import SyclPlatform @@ -13,5 +14,6 @@ __all__ = [ "X86VectorArch", "GenericGpu", "CudaPlatform", + "HipPlatform", "SyclPlatform", ] diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index d630594ba..98ff3e3d3 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,335 +1,11 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from ...types import constify, deconstify -from ..exceptions import MaterializationError from .generic_gpu import GenericGpu -from ..memory import PsSymbol -from ..kernelcreation import ( - Typifier, - IterationSpace, - FullIterationSpace, - SparseIterationSpace, - AstFactory, -) - -from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration -from ..ast.expressions import ( - PsExpression, - PsLiteralExpr, - PsCast, - PsCall, - PsLookup, - PsBufferAcc, -) -from ..ast.expressions import PsLt, PsAnd -from ...types import PsSignedIntegerType, PsIeeeFloatType -from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction - - -int32 = PsSignedIntegerType(width=32, const=False) - -BLOCK_IDX = [ - PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z") -] -THREAD_IDX = [ - PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z") -] -BLOCK_DIM = [ - PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z") -] -GRID_DIM = [ - PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z") -] - - -class ThreadMapping(ABC): - - @abstractmethod - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - """Map the current thread index onto a point in the given iteration space. - - Implementations of this method must return a declaration for each dimension counter - of the given iteration space. - """ - - -class Linear3DMapping(ThreadMapping): - """3D globally linearized mapping, where each thread is assigned a work item according to - its location in the global launch grid.""" - - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - match ispace: - case FullIterationSpace(): - return self._dense_mapping(ispace) - case SparseIterationSpace(): - return self._sparse_mapping(ispace) - case _: - assert False, "unexpected iteration space" - - def _dense_mapping( - self, ispace: FullIterationSpace - ) -> dict[PsSymbol, PsExpression]: - if ispace.rank > 3: - raise MaterializationError( - f"Cannot handle {ispace.rank}-dimensional iteration space " - "using the Linear3D GPU thread index mapping." - ) - - dimensions = ispace.dimensions_in_loop_order() - idx_map: dict[PsSymbol, PsExpression] = dict() - - for coord, dim in enumerate(dimensions[::-1]): - tid = self._linear_thread_idx(coord) - idx_map[dim.counter] = dim.start + dim.step * PsCast( - deconstify(dim.counter.get_dtype()), tid - ) - - return idx_map - - def _sparse_mapping( - self, ispace: SparseIterationSpace - ) -> dict[PsSymbol, PsExpression]: - sparse_ctr = PsExpression.make(ispace.sparse_counter) - thread_idx = self._linear_thread_idx(0) - idx_map: dict[PsSymbol, PsExpression] = { - ispace.sparse_counter: PsCast( - deconstify(sparse_ctr.get_dtype()), thread_idx - ) - } - return idx_map - - def _linear_thread_idx(self, coord: int): - block_size = BLOCK_DIM[coord] - block_idx = BLOCK_IDX[coord] - thread_idx = THREAD_IDX[coord] - return block_idx * block_size + thread_idx - - -class Blockwise4DMapping(ThreadMapping): - """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions - are mapped to block indices.""" - - _indices_fastest_first = [ # slowest to fastest - THREAD_IDX[0], - BLOCK_IDX[0], - BLOCK_IDX[1], - BLOCK_IDX[2] - ] - - def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: - match ispace: - case FullIterationSpace(): - return self._dense_mapping(ispace) - case SparseIterationSpace(): - return self._sparse_mapping(ispace) - case _: - assert False, "unexpected iteration space" - - def _dense_mapping( - self, ispace: FullIterationSpace - ) -> dict[PsSymbol, PsExpression]: - if ispace.rank > 4: - raise MaterializationError( - f"Cannot handle {ispace.rank}-dimensional iteration space " - "using the Blockwise4D GPU thread index mapping." - ) - - dimensions = ispace.dimensions_in_loop_order() - idx_map: dict[PsSymbol, PsExpression] = dict() - - for dim, tid in zip(dimensions[::-1], self._indices_fastest_first): - idx_map[dim.counter] = dim.start + dim.step * PsCast( - deconstify(dim.counter.get_dtype()), tid - ) - - return idx_map - - def _sparse_mapping( - self, ispace: SparseIterationSpace - ) -> dict[PsSymbol, PsExpression]: - sparse_ctr = PsExpression.make(ispace.sparse_counter) - thread_idx = self._indices_fastest_first[0] - idx_map: dict[PsSymbol, PsExpression] = { - ispace.sparse_counter: PsCast( - deconstify(sparse_ctr.get_dtype()), thread_idx - ) - } - return idx_map - class CudaPlatform(GenericGpu): - """Platform for CUDA-based GPUs. - - Args: - ctx: The kernel creation context - omit_range_check: If `True`, generated index translation code will not check if the point identified - by block and thread indices is actually contained in the iteration space - thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points - """ - - def __init__( - self, - ctx: KernelCreationContext, - omit_range_check: bool = False, - thread_mapping: ThreadMapping | None = None, - ) -> None: - super().__init__(ctx) - - self._omit_range_check = omit_range_check - self._thread_mapping = ( - thread_mapping if thread_mapping is not None else Linear3DMapping() - ) - - self._typify = Typifier(ctx) + """Platform for the CUDA GPU taret.""" @property def required_headers(self) -> set[str]: - return {'"pystencils_runtime/hip.h"'} # TODO: move to HipPlatform once it is introduced - - def materialize_iteration_space( - self, body: PsBlock, ispace: IterationSpace - ) -> PsBlock: - if isinstance(ispace, FullIterationSpace): - return self._prepend_dense_translation(body, ispace) - elif isinstance(ispace, SparseIterationSpace): - return self._prepend_sparse_translation(body, ispace) - else: - raise MaterializationError(f"Unknown type of iteration space: {ispace}") - - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) - - func = call.function.func - dtype = call.get_dtype() - arg_types = (dtype,) * func.num_args - - if isinstance(dtype, PsIeeeFloatType): - match func: - case ( - MathFunctions.Exp - | MathFunctions.Log - | MathFunctions.Sin - | MathFunctions.Cos - | MathFunctions.Sqrt - | MathFunctions.Ceil - | MathFunctions.Floor - ) if dtype.width in (16, 32, 64): - prefix = "h" if dtype.width == 16 else "" - suffix = "f" if dtype.width == 32 else "" - name = f"{prefix}{func.function_name}{suffix}" - cfunc = CFunction(name, arg_types, dtype) - - case ( - MathFunctions.Pow - | MathFunctions.Tan - | MathFunctions.Sinh - | MathFunctions.Cosh - | MathFunctions.ASin - | MathFunctions.ACos - | MathFunctions.ATan - | MathFunctions.ATan2 - ) if dtype.width in (32, 64): - # These are unavailable for fp16 - suffix = "f" if dtype.width == 32 else "" - name = f"{func.function_name}{suffix}" - cfunc = CFunction(name, arg_types, dtype) - - case ( - MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs - ) if dtype.width in (32, 64): - suffix = "f" if dtype.width == 32 else "" - name = f"f{func.function_name}{suffix}" - cfunc = CFunction(name, arg_types, dtype) - - case MathFunctions.Abs if dtype.width == 16: - cfunc = CFunction(" __habs", arg_types, dtype) - - case _: - raise MaterializationError( - f"Cannot materialize call to function {func}" - ) - - call.function = cfunc - return call - - raise MaterializationError( - f"No implementation available for function {func} on data type {dtype}" - ) - - # Internals - - def _prepend_dense_translation( - self, body: PsBlock, ispace: FullIterationSpace - ) -> PsBlock: - ctr_mapping = self._thread_mapping(ispace) - - indexing_decls = [] - conds = [] - - dimensions = ispace.dimensions_in_loop_order() - - for dim in dimensions: - # counter declarations must be ordered slowest-to-fastest - # such that inner dimensions can depend on outer ones - - dim.counter.dtype = constify(dim.counter.get_dtype()) - - ctr_expr = PsExpression.make(dim.counter) - indexing_decls.append( - self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) - ) - if not self._omit_range_check: - conds.append(PsLt(ctr_expr, dim.stop)) - - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - else: - body.statements = indexing_decls + body.statements - ast = body - - return ast - - def _prepend_sparse_translation( - self, body: PsBlock, ispace: SparseIterationSpace - ) -> PsBlock: - factory = AstFactory(self._ctx) - ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) - - sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) - ctr_mapping = self._thread_mapping(ispace) - - sparse_idx_decl = self._typify( - PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter]) - ) - - mappings = [ - PsDeclaration( - PsExpression.make(ctr), - PsLookup( - PsBufferAcc( - ispace.index_list.base_pointer, - (sparse_ctr_expr.clone(), factory.parse_index(0)), - ), - coord.name, - ), - ) - for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members) - ] - body.statements = mappings + body.statements - - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body - - return ast + return set() diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index b5b35c8b0..fac37ffa5 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,7 +1,331 @@ from __future__ import annotations +from abc import ABC, abstractmethod +from ...types import constify, deconstify +from ..exceptions import MaterializationError from .platform import Platform +from ..memory import PsSymbol +from ..kernelcreation import ( + Typifier, + IterationSpace, + FullIterationSpace, + SparseIterationSpace, + AstFactory, +) + +from ..kernelcreation.context import KernelCreationContext +from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.expressions import ( + PsExpression, + PsLiteralExpr, + PsCast, + PsCall, + PsLookup, + PsBufferAcc, +) +from ..ast.expressions import PsLt, PsAnd +from ...types import PsSignedIntegerType, PsIeeeFloatType +from ..literals import PsLiteral +from ..functions import PsMathFunction, MathFunctions, CFunction + + +int32 = PsSignedIntegerType(width=32, const=False) + +BLOCK_IDX = [ + PsLiteralExpr(PsLiteral(f"blockIdx.{coord}", int32)) for coord in ("x", "y", "z") +] +THREAD_IDX = [ + PsLiteralExpr(PsLiteral(f"threadIdx.{coord}", int32)) for coord in ("x", "y", "z") +] +BLOCK_DIM = [ + PsLiteralExpr(PsLiteral(f"blockDim.{coord}", int32)) for coord in ("x", "y", "z") +] +GRID_DIM = [ + PsLiteralExpr(PsLiteral(f"gridDim.{coord}", int32)) for coord in ("x", "y", "z") +] + + +class ThreadMapping(ABC): + + @abstractmethod + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + """Map the current thread index onto a point in the given iteration space. + + Implementations of this method must return a declaration for each dimension counter + of the given iteration space. + """ + + +class Linear3DMapping(ThreadMapping): + """3D globally linearized mapping, where each thread is assigned a work item according to + its location in the global launch grid.""" + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 3: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Linear3D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for coord, dim in enumerate(dimensions[::-1]): + tid = self._linear_thread_idx(coord) + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._linear_thread_idx(0) + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + + def _linear_thread_idx(self, coord: int): + block_size = BLOCK_DIM[coord] + block_idx = BLOCK_IDX[coord] + thread_idx = THREAD_IDX[coord] + return block_idx * block_size + thread_idx + + +class Blockwise4DMapping(ThreadMapping): + """Blockwise index mapping for up to 4D iteration spaces, where the outer three dimensions + are mapped to block indices.""" + + _indices_fastest_first = [ # slowest to fastest + THREAD_IDX[0], + BLOCK_IDX[0], + BLOCK_IDX[1], + BLOCK_IDX[2] + ] + + def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: + match ispace: + case FullIterationSpace(): + return self._dense_mapping(ispace) + case SparseIterationSpace(): + return self._sparse_mapping(ispace) + case _: + assert False, "unexpected iteration space" + + def _dense_mapping( + self, ispace: FullIterationSpace + ) -> dict[PsSymbol, PsExpression]: + if ispace.rank > 4: + raise MaterializationError( + f"Cannot handle {ispace.rank}-dimensional iteration space " + "using the Blockwise4D GPU thread index mapping." + ) + + dimensions = ispace.dimensions_in_loop_order() + idx_map: dict[PsSymbol, PsExpression] = dict() + + for dim, tid in zip(dimensions[::-1], self._indices_fastest_first): + idx_map[dim.counter] = dim.start + dim.step * PsCast( + deconstify(dim.counter.get_dtype()), tid + ) + + return idx_map + + def _sparse_mapping( + self, ispace: SparseIterationSpace + ) -> dict[PsSymbol, PsExpression]: + sparse_ctr = PsExpression.make(ispace.sparse_counter) + thread_idx = self._indices_fastest_first[0] + idx_map: dict[PsSymbol, PsExpression] = { + ispace.sparse_counter: PsCast( + deconstify(sparse_ctr.get_dtype()), thread_idx + ) + } + return idx_map + class GenericGpu(Platform): - """Base class for GPU platforms.""" + """Common base platform for CUDA- and HIP-type GPU targets. + + Args: + ctx: The kernel creation context + omit_range_check: If `True`, generated index translation code will not check if the point identified + by block and thread indices is actually contained in the iteration space + thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points + """ + + def __init__( + self, + ctx: KernelCreationContext, + omit_range_check: bool = False, + thread_mapping: ThreadMapping | None = None, + ) -> None: + super().__init__(ctx) + + self._omit_range_check = omit_range_check + self._thread_mapping = ( + thread_mapping if thread_mapping is not None else Linear3DMapping() + ) + + self._typify = Typifier(ctx) + + def materialize_iteration_space( + self, body: PsBlock, ispace: IterationSpace + ) -> PsBlock: + if isinstance(ispace, FullIterationSpace): + return self._prepend_dense_translation(body, ispace) + elif isinstance(ispace, SparseIterationSpace): + return self._prepend_sparse_translation(body, ispace) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + def select_function(self, call: PsCall) -> PsExpression: + assert isinstance(call.function, PsMathFunction) + + func = call.function.func + dtype = call.get_dtype() + arg_types = (dtype,) * func.num_args + + if isinstance(dtype, PsIeeeFloatType): + match func: + case ( + MathFunctions.Exp + | MathFunctions.Log + | MathFunctions.Sin + | MathFunctions.Cos + | MathFunctions.Sqrt + | MathFunctions.Ceil + | MathFunctions.Floor + ) if dtype.width in (16, 32, 64): + prefix = "h" if dtype.width == 16 else "" + suffix = "f" if dtype.width == 32 else "" + name = f"{prefix}{func.function_name}{suffix}" + cfunc = CFunction(name, arg_types, dtype) + + case ( + MathFunctions.Pow + | MathFunctions.Tan + | MathFunctions.Sinh + | MathFunctions.Cosh + | MathFunctions.ASin + | MathFunctions.ACos + | MathFunctions.ATan + | MathFunctions.ATan2 + ) if dtype.width in (32, 64): + # These are unavailable for fp16 + suffix = "f" if dtype.width == 32 else "" + name = f"{func.function_name}{suffix}" + cfunc = CFunction(name, arg_types, dtype) + + case ( + MathFunctions.Min | MathFunctions.Max | MathFunctions.Abs + ) if dtype.width in (32, 64): + suffix = "f" if dtype.width == 32 else "" + name = f"f{func.function_name}{suffix}" + cfunc = CFunction(name, arg_types, dtype) + + case MathFunctions.Abs if dtype.width == 16: + cfunc = CFunction(" __habs", arg_types, dtype) + + case _: + raise MaterializationError( + f"Cannot materialize call to function {func}" + ) + + call.function = cfunc + return call + + raise MaterializationError( + f"No implementation available for function {func} on data type {dtype}" + ) + + # Internals + + def _prepend_dense_translation( + self, body: PsBlock, ispace: FullIterationSpace + ) -> PsBlock: + ctr_mapping = self._thread_mapping(ispace) + + indexing_decls = [] + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + # counter declarations must be ordered slowest-to-fastest + # such that inner dimensions can depend on outer ones + + dim.counter.dtype = constify(dim.counter.get_dtype()) + + ctr_expr = PsExpression.make(dim.counter) + indexing_decls.append( + self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) + ) + if not self._omit_range_check: + conds.append(PsLt(ctr_expr, dim.stop)) + + if conds: + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) + else: + body.statements = indexing_decls + body.statements + ast = body + + return ast + + def _prepend_sparse_translation( + self, body: PsBlock, ispace: SparseIterationSpace + ) -> PsBlock: + factory = AstFactory(self._ctx) + ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) + + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + ctr_mapping = self._thread_mapping(ispace) + + sparse_idx_decl = self._typify( + PsDeclaration(sparse_ctr_expr, ctr_mapping[ispace.sparse_counter]) + ) + + mappings = [ + PsDeclaration( + PsExpression.make(ctr), + PsLookup( + PsBufferAcc( + ispace.index_list.base_pointer, + (sparse_ctr_expr.clone(), factory.parse_index(0)), + ), + coord.name, + ), + ) + for ctr, coord in zip(ispace.spatial_indices, ispace.coordinate_members) + ] + body.statements = mappings + body.statements + + if not self._omit_range_check: + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr_expr.clone(), stop) + ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) + else: + body.statements = [sparse_idx_decl] + body.statements + ast = body + + return ast diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py new file mode 100644 index 000000000..c758995a0 --- /dev/null +++ b/src/pystencils/backend/platforms/hip.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .generic_gpu import GenericGpu + + +class HipPlatform(GenericGpu): + """Platform for the HIP GPU taret.""" + + @property + def required_headers(self) -> set[str]: + return {'"pystencils_runtime/hip.h"'} diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index f53f1b9b8..9e3c8b163 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -440,21 +440,27 @@ class DefaultKernelCreationDriver: gpu_opts = self._cfg.gpu omit_range_check: bool = gpu_opts.get_option("omit_range_check") + thread_mapping = ( + self._gpu_indexing.get_thread_mapping() + if self._gpu_indexing is not None + else None + ) + + GpuPlatform: type match self._target: - case Target.CUDA | Target.HIP: - from ..backend.platforms import CudaPlatform - - thread_mapping = ( - self._gpu_indexing.get_thread_mapping() - if self._gpu_indexing is not None - else None - ) - - return CudaPlatform( - self._ctx, - omit_range_check=omit_range_check, - thread_mapping=thread_mapping, - ) + case Target.CUDA: + from ..backend.platforms import CudaPlatform as GpuPlatform + case Target.HIP: + from ..backend.platforms import HipPlatform as GpuPlatform + case _: + assert False, f"unexpected GPU target: {self._target}" + + return GpuPlatform( + self._ctx, + omit_range_check=omit_range_check, + thread_mapping=thread_mapping, + ) + elif self._target == Target.SYCL: from ..backend.platforms import SyclPlatform diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 27d6fc817..890afb0fb 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -14,7 +14,7 @@ from ..backend.kernelcreation import ( FullIterationSpace, SparseIterationSpace, ) -from ..backend.platforms.cuda import ThreadMapping +from ..backend.platforms.generic_gpu import ThreadMapping from ..backend.ast.expressions import PsExpression @@ -236,7 +236,7 @@ class GpuIndexing: def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" - from ..backend.platforms.cuda import Linear3DMapping, Blockwise4DMapping + from ..backend.platforms.generic_gpu import Linear3DMapping, Blockwise4DMapping match self._scheme: case GpuIndexingScheme.Linear3D: -- GitLab From de1dc39e761e2fd151c12e87952a897a150674d6 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 14 Mar 2025 16:47:42 +0000 Subject: [PATCH 05/15] start fixing JIT for HIP --- src/pystencils/codegen/config.py | 6 ++++-- src/pystencils/codegen/target.py | 12 ++++++++++++ src/pystencils/jit/gpu_cupy.py | 10 +++++++--- tests/fixtures.py | 5 ++++- tests/kernelcreation/test_buffer_gpu.py | 24 ++++++++++++------------ 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 821bb7e07..a765dea2e 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -586,6 +586,8 @@ class CreateKernelConfig(ConfigBase): match t: case Target.CurrentCPU: return Target.auto_cpu() + case Target.CurrentGPU: + return Target.auto_gpu() case _: return t @@ -600,7 +602,7 @@ class CreateKernelConfig(ConfigBase): from ..jit import LegacyCpuJit return LegacyCpuJit() - elif target == Target.CUDA: + elif target == Target.CUDA or target == Target.HIP: try: from ..jit.gpu_cupy import CupyJit @@ -611,7 +613,7 @@ class CreateKernelConfig(ConfigBase): return no_jit - elif target == Target.SYCL or target == Target.HIP: + elif target == Target.SYCL: from ..jit import no_jit return no_jit diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index 03364af28..c4b08b95c 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -126,6 +126,18 @@ class Target(Flag): else: return Target.GenericCPU + @staticmethod + def auto_gpu() -> Target: + try: + import cupy + + if cupy.cuda.runtime.is_hip: + return Target.HIP + else: + return Target.CUDA + except ImportError: + raise RuntimeError("Cannot infer GPU target since cupy is not installed.") + @staticmethod def available_targets() -> list[Target]: targets = [Target.GenericCPU] diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index d45abf878..1461bfd7a 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -208,7 +208,11 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): - self._runtime_headers = {"<cstdint>"} + self._runtime_headers: set[str] + if cp.cuda.runtime.is_hip: + self._runtime_headers = set() + else: + self._runtime_headers = {"<cstdint>"} if len(default_block_size) > 3: raise ValueError( @@ -226,9 +230,9 @@ class CupyJit(JitBase): "`cupy` is not installed: just-in-time-compilation of CUDA kernels is unavailable." ) - if not isinstance(kernel, GpuKernel) or kernel.target != Target.CUDA: + if not isinstance(kernel, GpuKernel): raise ValueError( - "The CupyJit just-in-time compiler only accepts kernels generated for CUDA or HIP" + "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP" ) options = self._compiler_options() diff --git a/tests/fixtures.py b/tests/fixtures.py index ba2593f76..a19519988 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -23,7 +23,10 @@ AVAILABLE_TARGETS = [ps.Target.GenericCPU] try: import cupy - AVAILABLE_TARGETS += [ps.Target.CUDA] + if cupy.cuda.runtime.is_hip: + AVAILABLE_TARGETS += [ps.Target.HIP] + else: + AVAILABLE_TARGETS += [ps.Target.CUDA] except ImportError: pass diff --git a/tests/kernelcreation/test_buffer_gpu.py b/tests/kernelcreation/test_buffer_gpu.py index 0b5019fba..bd9d2156b 100644 --- a/tests/kernelcreation/test_buffer_gpu.py +++ b/tests/kernelcreation/test_buffer_gpu.py @@ -58,7 +58,7 @@ def test_full_scalar_field(): pack_eqs = [Assignment(buffer.center(), src_field.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -66,7 +66,7 @@ def test_full_scalar_field(): unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -94,7 +94,7 @@ def test_field_slice(): pack_eqs = [Assignment(buffer.center(), src_field.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -103,7 +103,7 @@ def test_field_slice(): # Unpack into ghost layer of dst_field in N direction unpack_eqs = [Assignment(dst_field.center(), buffer.center())] - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -131,7 +131,7 @@ def test_all_cell_values(): eq = Assignment(buffer(idx), src_field(idx)) pack_eqs.append(eq) - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -143,7 +143,7 @@ def test_all_cell_values(): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr) @@ -173,7 +173,7 @@ def test_subset_cell_values(): pack_eqs.append(eq) pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr) @@ -185,7 +185,7 @@ def test_subset_cell_values(): unpack_eqs.append(eq) unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -215,7 +215,7 @@ def test_field_layouts(): pack_eqs.append(eq) pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) pack_ast = create_kernel(pack_eqs, config=config) pack_kernel = pack_ast.compile() @@ -228,7 +228,7 @@ def test_field_layouts(): unpack_eqs.append(eq) unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype} - config = CreateKernelConfig(target=pystencils.Target.GPU) + config = CreateKernelConfig(target=pystencils.Target.CurrentGPU) unpack_ast = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_ast.compile() @@ -299,7 +299,7 @@ def test_iteration_slices(gpu_indexing): gpu_src_arr.set(src_arr) gpu_dst_arr.fill(0) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) + config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice) pack_code = create_kernel(pack_eqs, config=config) pack_kernel = pack_code.compile() @@ -311,7 +311,7 @@ def test_iteration_slices(gpu_indexing): eq = Assignment(dst_field(idx), buffer(idx)) unpack_eqs.append(eq) - config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice) + config = CreateKernelConfig(target=Target.CurrentGPU, iteration_slice=pack_slice) unpack_code = create_kernel(unpack_eqs, config=config) unpack_kernel = unpack_code.compile() -- GitLab From a01997e443d504e40d1d2f882c593b37a3c7a2cd Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 14 Mar 2025 17:46:52 +0000 Subject: [PATCH 06/15] fix remaining test suite --- src/pystencils/boundaries/boundaryhandling.py | 8 +++---- src/pystencils/codegen/target.py | 21 ++++++++++++++++--- .../datahandling/datahandling_interface.py | 2 -- .../datahandling/serial_datahandling.py | 6 +++--- src/pystencils/jit/gpu_cupy.py | 12 ++++++++++- tests/fixtures.py | 18 +++------------- tests/kernelcreation/test_functions.py | 14 ++++++------- tests/kernelcreation/test_gpu.py | 10 ++++----- tests/kernelcreation/test_half_precision.py | 4 ++-- tests/kernelcreation/test_index_kernels.py | 17 +++++---------- tests/kernelcreation/test_iteration_slices.py | 4 ++-- tests/runtime/test_boundary.py | 4 ++-- tests/runtime/test_datahandling.py | 12 +++++------ 13 files changed, 67 insertions(+), 65 deletions(-) diff --git a/src/pystencils/boundaries/boundaryhandling.py b/src/pystencils/boundaries/boundaryhandling.py index 1f6e3d126..58340c3e0 100644 --- a/src/pystencils/boundaries/boundaryhandling.py +++ b/src/pystencils/boundaries/boundaryhandling.py @@ -123,7 +123,7 @@ class BoundaryHandling: class_ = self.IndexFieldBlockData class_.to_cpu = to_cpu class_.to_gpu = to_gpu - gpu = self._target in data_handling._GPU_LIKE_TARGETS + gpu = self._target.is_gpu() data_handling.add_custom_class(self._index_array_name, class_, cpu=True, gpu=gpu) @property @@ -240,7 +240,7 @@ class BoundaryHandling: if self._dirty: self.prepare() - for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS): + for b in self._data_handling.iterate(gpu=self._target.is_gpu()): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): kwargs[self._field_name] = b[self._field_name] kwargs['indexField'] = idx_arr @@ -255,7 +255,7 @@ class BoundaryHandling: if self._dirty: self.prepare() - for b in self._data_handling.iterate(gpu=self._target in self._data_handling._GPU_LIKE_TARGETS): + for b in self._data_handling.iterate(gpu=self._target.is_gpu()): for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items(): arguments = kwargs.copy() arguments[self._field_name] = b[self._field_name] @@ -341,7 +341,7 @@ class BoundaryHandling: def _boundary_data_initialization(self, boundary_obj, boundary_data_setter, **kwargs): if boundary_obj.additional_data_init_callback: boundary_obj.additional_data_init_callback(boundary_data_setter, **kwargs) - if self._target in self._data_handling._GPU_LIKE_TARGETS: + if self._target.is_gpu(): self._data_handling.to_gpu(self._index_array_name) class BoundaryInfo(object): diff --git a/src/pystencils/codegen/target.py b/src/pystencils/codegen/target.py index c4b08b95c..5e214430c 100644 --- a/src/pystencils/codegen/target.py +++ b/src/pystencils/codegen/target.py @@ -93,8 +93,8 @@ class Target(Flag): Generate a HIP kernel for generic AMD or NVidia GPUs. """ - GPU = CUDA - """Alias for `Target.CUDA`, for backward compatibility.""" + GPU = CurrentGPU + """Alias for `Target.CurrentGPU`, for backward compatibility.""" SYCL = _SYCL """SYCL kernel target. @@ -106,15 +106,24 @@ class Target(Flag): """ def is_automatic(self) -> bool: + """Determine if this target is a proxy target that is automatically resolved + according to the runtime environment.""" return Target._AUTOMATIC in self def is_cpu(self) -> bool: + """Determine if this target is a CPU target.""" return Target._CPU in self def is_vector_cpu(self) -> bool: + """Determine if this target is a vector CPU target.""" return self.is_cpu() and Target._VECTOR in self def is_gpu(self) -> bool: + """Determine if this target is a GPU target. + + This refers to targets for the CUDA and HIP family of platforms. + `Target.SYCL` is *not* a GPU target. + """ return Target._GPU in self @staticmethod @@ -128,6 +137,11 @@ class Target(Flag): @staticmethod def auto_gpu() -> Target: + """Return the GPU target available in the current runtime environment. + + Raises: + RuntimeError: If `cupy` is not installed and therefore no GPU runtime is available. + """ try: import cupy @@ -140,10 +154,11 @@ class Target(Flag): @staticmethod def available_targets() -> list[Target]: + """List available""" targets = [Target.GenericCPU] try: import cupy # noqa: F401 - targets.append(Target.CUDA) + targets.append(Target.auto_gpu()) except ImportError: pass diff --git a/src/pystencils/datahandling/datahandling_interface.py b/src/pystencils/datahandling/datahandling_interface.py index 867bbf062..a6b1fcb55 100644 --- a/src/pystencils/datahandling/datahandling_interface.py +++ b/src/pystencils/datahandling/datahandling_interface.py @@ -17,8 +17,6 @@ class DataHandling(ABC): 'gather' function that has collects (parts of the) distributed data on a single process. """ - _GPU_LIKE_TARGETS = [Target.GPU] - # ---------------------------- Adding and accessing data ----------------------------------------------------------- @property @abstractmethod diff --git a/src/pystencils/datahandling/serial_datahandling.py b/src/pystencils/datahandling/serial_datahandling.py index 73b749ca4..dc6904c3a 100644 --- a/src/pystencils/datahandling/serial_datahandling.py +++ b/src/pystencils/datahandling/serial_datahandling.py @@ -110,7 +110,7 @@ class SerialDataHandling(DataHandling): if layout is None: layout = self.default_layout if gpu is None: - gpu = self.default_target in self._GPU_LIKE_TARGETS + gpu = self.default_target.is_gpu() kwargs = { 'shape': tuple(s + 2 * ghost_layers for s in self._domainSize), @@ -241,7 +241,7 @@ class SerialDataHandling(DataHandling): def swap(self, name1, name2, gpu=None): if gpu is None: - gpu = self.default_target in self._GPU_LIKE_TARGETS + gpu = self.default_target.is_gpu() arr = self.gpu_arrays if gpu else self.cpu_arrays arr[name1], arr[name2] = arr[name2], arr[name1] @@ -292,7 +292,7 @@ class SerialDataHandling(DataHandling): if target is None: target = self.default_target - if not (target.is_cpu() or target == Target.CUDA): + if not (target.is_cpu() or target.is_gpu()): raise ValueError(f"Unsupported target: {target}") if not hasattr(names, '__len__') or type(names) is str: diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 1461bfd7a..69e965325 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -231,9 +231,19 @@ class CupyJit(JitBase): ) if not isinstance(kernel, GpuKernel): - raise ValueError( + raise JitError( "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP" ) + + if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip: + raise JitError( + "Cannot compile a CUDA kernel on a HIP-based Cupy installation." + ) + + if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip: + raise JitError( + "Cannot compile a HIP kernel on a CUDA-based Cupy installation." + ) options = self._compiler_options() prelude = self._prelude(kernel) diff --git a/tests/fixtures.py b/tests/fixtures.py index a19519988..a4c77f550 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -18,19 +18,7 @@ from types import ModuleType import pystencils as ps -AVAILABLE_TARGETS = [ps.Target.GenericCPU] - -try: - import cupy - - if cupy.cuda.runtime.is_hip: - AVAILABLE_TARGETS += [ps.Target.HIP] - else: - AVAILABLE_TARGETS += [ps.Target.CUDA] -except ImportError: - pass - -AVAILABLE_TARGETS += ps.Target.available_vector_cpu_targets() +AVAILABLE_TARGETS = ps.Target.available_targets() TARGET_IDS = [t.name for t in AVAILABLE_TARGETS] @@ -75,9 +63,9 @@ def xp(target: ps.Target) -> ModuleType: """Primary array module for the current target. Returns: - `cupy` if `target == Target.CUDA`, and `numpy` otherwise + `cupy` if `target.is_gpu()`, and `numpy` otherwise """ - if target == ps.Target.CUDA: + if target.is_gpu(): import cupy as xp return xp diff --git a/tests/kernelcreation/test_functions.py b/tests/kernelcreation/test_functions.py index a4d154d4b..182a59005 100644 --- a/tests/kernelcreation/test_functions.py +++ b/tests/kernelcreation/test_functions.py @@ -106,14 +106,14 @@ def function_domain(function_name, dtype): case "pow": return np.concatenate( [ - [0., 1., 1.], - rng.uniform(-1., 1., 8), - rng.uniform(0., 5., 8), + [0.0, 1.0, 1.0], + rng.uniform(-1.0, 1.0, 8), + rng.uniform(0.0, 5.0, 8), ] ).astype(dtype), np.concatenate( [ - [1., 0., 2.], - np.arange(2., 10., 1.), + [1.0, 0.0, 2.0], + np.arange(2.0, 10.0, 1.0), rng.uniform(-2.0, 2.0, 8), ] ).astype( @@ -211,14 +211,14 @@ def test_binary_functions(gen_config, xp, function_name, dtype, function_domain) dtype_and_target_for_integer_funcs = pytest.mark.parametrize( "dtype, target", - list(product([np.int32], [t for t in AVAIL_TARGETS if t is not Target.CUDA])) + list(product([np.int32], [t for t in AVAIL_TARGETS if not t.is_gpu()])) + list( product( [np.int64], [ t for t in AVAIL_TARGETS - if t not in (Target.X86_SSE, Target.X86_AVX, Target.CUDA) + if t not in (Target.X86_SSE, Target.X86_AVX) and not t.is_gpu() ], ) ), diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index f1905b1fc..8d943e8fd 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -45,7 +45,7 @@ def test_indexing_options_3d( + src[0, 0, 1], ) - cfg = CreateKernelConfig(target=Target.CUDA) + cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = indexing_scheme cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid @@ -91,7 +91,7 @@ def test_indexing_options_2d( + src[0, 1] ) - cfg = CreateKernelConfig(target=Target.CUDA) + cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = indexing_scheme cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid @@ -126,7 +126,7 @@ def test_invalid_indexing_schemes(): src, dst = fields("src, dst: [4D]") asm = Assignment(src.center(0), dst.center(0)) - cfg = CreateKernelConfig(target=Target.CUDA) + cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = "linear3d" with pytest.raises(Exception): @@ -241,7 +241,7 @@ def test_ghost_layer(): ghost_layers = [(1, 2), (2, 1)] config = CreateKernelConfig() - config.target = Target.CUDA + config.target = Target.CurrentGPU config.ghost_layers = ghost_layers config.gpu.indexing_scheme = "blockwise4d" @@ -270,7 +270,7 @@ def test_setting_value(): update_rule = [Assignment(f(0), sp.Symbol("value"))] config = CreateKernelConfig() - config.target = Target.CUDA + config.target = Target.CurrentGPU config.iteration_slice = iteration_slice config.gpu.indexing_scheme = "blockwise4d" diff --git a/tests/kernelcreation/test_half_precision.py b/tests/kernelcreation/test_half_precision.py index a9745459d..5dbe2180e 100644 --- a/tests/kernelcreation/test_half_precision.py +++ b/tests/kernelcreation/test_half_precision.py @@ -5,7 +5,7 @@ import numpy as np import pystencils as ps -@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU)) +@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.CurrentGPU)) def test_half_precison(target): if target == ps.Target.CPU: if not platform.machine() in ['arm64', 'aarch64']: @@ -14,7 +14,7 @@ def test_half_precison(target): if 'clang' not in ps.cpu.cpujit.get_compiler_config()['command']: pytest.xfail("skipping half precision because clang compiler is not used") - if target == ps.Target.GPU: + if target.is_gpu(): pytest.importorskip("cupy") dh = ps.create_data_handling(domain_size=(10, 10), default_target=target) diff --git a/tests/kernelcreation/test_index_kernels.py b/tests/kernelcreation/test_index_kernels.py index 569c0ab6a..bda0ef273 100644 --- a/tests/kernelcreation/test_index_kernels.py +++ b/tests/kernelcreation/test_index_kernels.py @@ -5,14 +5,7 @@ from pystencils import Assignment, Field, FieldType, AssignmentCollection, Targe from pystencils import create_kernel, CreateKernelConfig -@pytest.mark.parametrize("target", [Target.CPU, Target.GPU]) -def test_indexed_kernel(target): - if target == Target.GPU: - cp = pytest.importorskip("cupy") - xp = cp - else: - xp = np - +def test_indexed_kernel(target, xp): arr = xp.zeros((3, 4)) dtype = np.dtype([('x', int), ('y', int), ('value', arr.dtype)], align=True) @@ -21,8 +14,8 @@ def test_indexed_kernel(target): cpu_index_arr[1] = (1, 3, 42.0) cpu_index_arr[2] = (2, 1, 5.0) - if target == Target.GPU: - gpu_index_arr = cp.empty(cpu_index_arr.shape, cpu_index_arr.dtype) + if target.is_gpu(): + gpu_index_arr = xp.empty(cpu_index_arr.shape, cpu_index_arr.dtype) gpu_index_arr.set(cpu_index_arr) index_arr = gpu_index_arr else: @@ -40,8 +33,8 @@ def test_indexed_kernel(target): kernel(f=arr, index=index_arr) - if target == Target.GPU: - arr = cp.asnumpy(arr) + if target.is_gpu(): + arr = xp.asnumpy(arr) for i in range(cpu_index_arr.shape[0]): np.testing.assert_allclose(arr[cpu_index_arr[i]['x'], cpu_index_arr[i]['y']], cpu_index_arr[i]['value'], atol=1e-13) diff --git a/tests/kernelcreation/test_iteration_slices.py b/tests/kernelcreation/test_iteration_slices.py index b1f2da576..2b3a8ebf0 100644 --- a/tests/kernelcreation/test_iteration_slices.py +++ b/tests/kernelcreation/test_iteration_slices.py @@ -144,7 +144,7 @@ def test_triangle_pattern(gen_config: CreateKernelConfig, xp): islice = make_slice[:, slow_counter:] gen_config = replace(gen_config, iteration_slice=islice) - if gen_config.target == Target.CUDA: + if gen_config.target.is_gpu(): gen_config.gpu.manual_launch_grid = True kernel = create_kernel(update, gen_config).compile() @@ -177,7 +177,7 @@ def test_red_black_pattern(gen_config: CreateKernelConfig, xp): islice = make_slice[:, start::2] gen_config.iteration_slice = islice - if gen_config.target == Target.CUDA: + if gen_config.target.is_gpu(): gen_config.gpu.manual_launch_grid = True try: diff --git a/tests/runtime/test_boundary.py b/tests/runtime/test_boundary.py index fb8f827e8..226510b83 100644 --- a/tests/runtime/test_boundary.py +++ b/tests/runtime/test_boundary.py @@ -98,7 +98,7 @@ def test_kernel_vs_copy_boundary(): def test_boundary_gpu(): pytest.importorskip('cupy') - dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.GPU) + dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.CurrentGPU) src = dh.add_array('src') dh.fill("src", 0.0, ghost_layers=True) dh.fill("src", 1.0, ghost_layers=False) @@ -111,7 +111,7 @@ def test_boundary_gpu(): name="boundary_handling_cpu", target=Target.CPU) boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil, - name="boundary_handling_gpu", target=Target.GPU) + name="boundary_handling_gpu", target=Target.CurrentGPU) neumann = Neumann() for d in ('N', 'S', 'W', 'E'): diff --git a/tests/runtime/test_datahandling.py b/tests/runtime/test_datahandling.py index 9d7ff924e..9e7c73cac 100644 --- a/tests/runtime/test_datahandling.py +++ b/tests/runtime/test_datahandling.py @@ -118,7 +118,7 @@ def synchronization(dh, test_gpu=False): def kernel_execution_jacobi(dh, target): - test_gpu = target == Target.GPU + test_gpu = target == Target.CurrentGPU dh.add_array('f', gpu=test_gpu) dh.add_array('tmp', gpu=test_gpu) @@ -219,15 +219,15 @@ def test_kernel(): try: import cupy dh = create_data_handling(domain_size=domain_shape, periodicity=True) - kernel_execution_jacobi(dh, Target.GPU) + kernel_execution_jacobi(dh, Target.CurrentGPU) except ImportError: pass -@pytest.mark.parametrize('target', (Target.CPU, Target.GPU)) +@pytest.mark.parametrize('target', (Target.CPU, Target.CurrentGPU)) def test_kernel_param(target): for domain_shape in [(4, 5), (3, 4, 5)]: - if target == Target.GPU: + if target == Target.CurrentGPU: pytest.importorskip('cupy') dh = create_data_handling(domain_size=domain_shape, periodicity=True, default_target=target) @@ -262,7 +262,7 @@ def test_add_arrays(): def test_add_arrays_with_layout(shape, layout): pytest.importorskip('cupy') - dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.GPU) + dh = create_data_handling(domain_size=shape, default_layout=layout, default_target=ps.Target.CurrentGPU) f1 = dh.add_array("f1", values_per_cell=19) dh.fill(f1.name, 1.0) @@ -392,8 +392,6 @@ def test_array_handler(device_number): empty = array_handler.empty(shape=size, order="F") assert empty.strides == (8, 16) - random_array = array_handler.randn(size) - cpu_array = np.empty((20, 40), dtype=np.float64) gpu_array = array_handler.to_gpu(cpu_array) -- GitLab From d5de40476583e59c40246ac843d7a0f161585920 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 14 Mar 2025 18:01:28 +0000 Subject: [PATCH 07/15] start updating docs --- docs/source/installation.md | 16 +++++++-- docs/source/user_manual/gpu_kernels.md | 48 ++++++++++++++++---------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/docs/source/installation.md b/docs/source/installation.md index deb2b0613..8c344e760 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -48,10 +48,22 @@ to build this documentation, and `tests`, which adds `flake8` for code style che For more information on developing pystencils, see the [](#contribution_guide). ::: -### For Nvidia GPUs +### For GPUs If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile and execute kernels running on your GPU. -This requires a working installation of [cupy](https://cupy.dev). +This requires a working installation of [Cupy](https://cupy.dev). Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html) for details about installing cupy. + +You can also use Cupy together with AMD ROCm for AMD graphics cards, +but the setup steps are a bit more complicated - you might have to build cupy from source. +The Cupy documentation covers this in their [installation guide for Cupy on ROCm][cupy-rocm]. + +:::{note} +Since Cupy's support for ROCm is at this time still an experimental feature, +just-in-time compilation of pystencils HIP kernels +for the ROCm platform must also considered *experimental*. +::: + +[cupy-rocm]: https://docs.cupy.dev/en/stable/install.html#using-cupy-on-amd-gpu-experimental "Cupy on ROCm" diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 610c61ddf..2219ce042 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -26,23 +26,46 @@ import matplotlib.pyplot as plt (guide_gpukernels)= # Pystencils for GPUs -Pystencils offers code generation for Nvidia GPUs using the CUDA programming model, +Pystencils offers code generation for Nvidia and AMD GPUs +using the CUDA and HIP programming models, as well as just-in-time compilation and execution of CUDA kernels from within Python based on the [cupy] library. This section's objective is to give a detailed introduction into the creation of GPU kernels with pystencils. -## Generate, Compile and Run CUDA Kernels +:::{note} +[CuPy][cupy] is a Python library for numerical computations on GPU arrays, +which operates much in the same way that [NumPy][numpy] works on CPU arrays. +Cupy and NumPy expose nearly the same APIs for array operations; +the difference being that CuPy allocates all its arrays on the GPU +and performs its operations as CUDA kernels. +Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc]. +In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code, +and to allocate and manage the data these kernels can be executed on. + +For more information on CuPy, refer to [their documentation][cupy-docs]. +::: + +## Generate, Compile and Run GPU Kernels + +The CUDA and HIP platforms are made available in pystencils via the code generation targets +{any}`Target.CUDA` and {any}`Target.HIP`. +For pystencils code to be portable between both, we can use {any}`Target.CurrentGPU` to +automatically select one or the other, depending on the current runtime environment. + +:::{note} +If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`. +When exporting kernels to be compiled externally in an environment where cupy is not available, +the GPU target must therefore be set explicitly. +::: -In order to obtain a CUDA implementation of a symbolic kernel, naught more is required -than setting the {any}`target <CreateKernelConfig.target>` code generator option to -{any}`Target.CUDA`: +Here is a snippet creating a kernel for the locally available GPU target: ```{code-cell} ipython3 f, g = ps.fields("f, g: float64[3D]") update = ps.Assignment(f.center(), 2 * g.center()) -cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) +cfg = ps.CreateKernelConfig(target=ps.Target.CurrentGPU) kernel = ps.create_kernel(update, cfg) ps.inspect(kernel) @@ -68,19 +91,6 @@ kfunc = kernel.compile() kfunc(f=f_arr, g=g_arr) ``` -:::{note} -[CuPy][cupy] is a Python library for numerical computations on GPU arrays, -which operates much in the same way that [NumPy][numpy] works on CPU arrays. -Cupy and NumPy expose nearly the same APIs for array operations; -the difference being that CuPy allocates all its arrays on the GPU -and performs its operations as CUDA kernels. -Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc]. -In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code, -and to allocate and manage the data these kernels can be executed on. - -For more information on CuPy, refer to [their documentation][cupy-docs]. -::: - (indexing_and_launch_config)= ## Modify the Indexing Scheme and Launch Configuration -- GitLab From 3d44a19e67699cc9f0b0d73426d7e76de985a742 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 14 Mar 2025 18:41:36 +0000 Subject: [PATCH 08/15] Update docs, installation + contrib guide --- docs/source/backend/gpu_codegen.md | 14 +++----- docs/source/backend/platforms.md | 15 ++++++-- docs/source/contributing/dev-workflow.md | 46 +++++++++++++++++++++--- docs/source/installation.md | 21 ++--------- docs/source/user_manual/gpu_kernels.md | 6 ++-- pyproject.toml | 3 +- 6 files changed, 67 insertions(+), 38 deletions(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 1082669e6..0415c12c6 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -2,23 +2,19 @@ The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components: - - The {any}`CudaPlatform` at `backend.platforms` which performs materialization of a kernel's iteration - space by mapping GPU block and thread indices to iteration space points. To perform this task, - it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping. + - The platforms {any}`CudaPlatform` and {any}`HipPlatform` at `backend.platforms` + which perform materialization of a kernel's iteration space + by mapping GPU block and thread indices to iteration space points. + To perform this task, it depends on a {any}`ThreadMapping` instance which defines the nature of that mapping. The platform also takes care of lowering mathematical functions to their CUDA runtime library implementation. - In the code generation driver, the strings are drawn by the `GpuIndexing` helper class. It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration for the runtime system. -:::{attention} - -Code generation for HIP through the `CudaPlatform` is experimental and not tested at the moment. -::: - ## The CUDA Platform and Thread Mappings ```{eval-rst} -.. module:: pystencils.backend.platforms.cuda +.. module:: pystencils.backend.platforms.generic_gpu .. autosummary:: :toctree: generated diff --git a/docs/source/backend/platforms.md b/docs/source/backend/platforms.md index e7ffc6f15..2d2c33d86 100644 --- a/docs/source/backend/platforms.md +++ b/docs/source/backend/platforms.md @@ -26,7 +26,6 @@ targets in the future. Platform GenericCpu GenericVectorCpu - GenericGpu ``` ## CPU Platforms @@ -49,6 +48,18 @@ targets in the future. :nosignatures: :template: autosummary/entire_class.rst + GenericGpu CudaPlatform - SyclPlatform + HipPlatform ``` + +## Experimental Platforms + +```{eval-rst} +.. autosummary:: + :toctree: generated + :nosignatures: + :template: autosummary/entire_class.rst + + SyclPlatform +``` \ No newline at end of file diff --git a/docs/source/contributing/dev-workflow.md b/docs/source/contributing/dev-workflow.md index 8daac8cbd..d92916130 100644 --- a/docs/source/contributing/dev-workflow.md +++ b/docs/source/contributing/dev-workflow.md @@ -48,16 +48,22 @@ git pull --set-upstream upstream master ## Set Up the Python Environment +### Prerequesites + To develop pystencils, you will need at least the following software installed on your machine: - Python 3.10 or later: Since pystencils minimal supported version is Python 3.10, we recommend that you work with Python 3.10 directly. - An up-to-date C++ compiler, used by pystencils to JIT-compile generated code - [Nox](https://nox.thea.codes/en/stable/), which we use for test automation. Nox will be used extensively in the instructions on testing below. -- Optionally [CUDA](https://developer.nvidia.com/cuda-toolkit), - if you have an Nvidia or AMD GPU and plan to develop on pystencils' GPU capabilities +- Optionally, for GPU development: + - At least CUDA 11 for Nvidia GPUs, or + - At least ROCm/HIP 6.1 for AMD GPUs. + +### Virtual Environment Setup -Once you have these, set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development. +Once you have all the prerequesites, +set up a [virtual environment](https://docs.python.org/3/library/venv.html) for development. This ensures that your system's installation of Python is kept clean, and isolates your development environment from outside influence. Use the following commands to create a virtual environment at `.venv` and perform an editable install of pystencils into it: @@ -74,7 +80,39 @@ Setting `PIP_REQUIRE_VIRTUALENV` ensures that pip refuses to install packages gl Consider setting this variable globally in your shell's configuration file. ::: -You are now ready to go! Create a new git branch to work on, open up an IDE, and start coding. +:::{admonition} Feature Groups +The above installation instructions assume that you will be running all code checking +and test tasks through `nox`. +If you need or want to run them manually, you will need to add one or more +of these feature groups to your installation: + + - `doc`, which contains all dependencies required to build this documentation; + - `dev`, which adds `flake8` for code style checking, + `mypy` for static type checking, + and the `black` formatter; + - `testsuite`, which adds `pytest` plus plugins and some more dependencies required + for running the test suite. + +Depending on your development focus, you might also need to add some of the user feature +groups listed in [the installation guide](#installation_guide). +::: + +### Cupy for CUDA and HIP + +When developing for NVidia or AMD GPUs, you will likely need an installation of [cupy](https://cupy.dev/). +Since cupy has to be built specifically against the libraries of a given CUDA or ROCm version, +it cannot be installed directly via dependency resolution from pystencils. +For instructions on how to install Cupy, refer to their [installation manual](https://docs.cupy.dev/en/stable/install.html). + +### Test Your Setup + +To check if your setup is complete, a good check is to invoke the pystencils test suite: + +```bash +nox -s "testsuite(cpu)" +``` + +If this finishes without errors, you are ready to go! Create a new git branch to work on, open up an IDE, and start coding. Make sure your IDE recognizes the virtual environment you created, though. ## Static Code Analysis diff --git a/docs/source/installation.md b/docs/source/installation.md index 8c344e760..5cb274c93 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,4 +1,4 @@ -(_installation)= +(installation_guide)= # Setup and Installation ## Install pystencils @@ -17,7 +17,7 @@ git clone -b v2.0-dev https://i10git.cs.fau.de/pycodegen/pystencils.git pip install -e pystencils ``` -### Feature Groups +## Feature Groups In both cases, you can add a set of optional features to your installation by listing them in square brackets (e.g. `pip install -e pystencils[feature1, feature2]`). @@ -33,22 +33,7 @@ The following feature sets are available: - `use_cython`: Install [Cython](https://cython.org/), which is used internally by pystencils to accelerate the setup of boundary conditions. -:::{dropdown} For Developers - -If you are developing pystencils, we recommend you perform an editable install of your -local clone of the repository, with all optional features: -```bash -pip install -e pystencils[alltrafos,interactive,use_cython,doc,testsuite] -``` - -This includes the additional feature groups `doc`, which contains all dependencies required -to build this documentation, and `tests`, which adds `flake8` for code style checking, -`mypy` for static type checking, and `pytest` plus plugins for running the test suite. - -For more information on developing pystencils, see the [](#contribution_guide). -::: - -### For GPUs +## For GPUs If you have an Nvidia graphics processor and CUDA installed, you can use pystencils to directly compile and execute kernels running on your GPU. diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 2219ce042..14a29c41c 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -55,8 +55,8 @@ automatically select one or the other, depending on the current runtime environm :::{note} If `cupy` is not installed, `create_kernel` will raise an exception when using `Target.CurrentGPU`. -When exporting kernels to be compiled externally in an environment where cupy is not available, -the GPU target must therefore be set explicitly. +You can still generate kernels for CUDA or HIP directly even without Cupy; +you just won't be able to just-in-time compile and run them. ::: Here is a snippet creating a kernel for the locally available GPU target: @@ -218,7 +218,7 @@ assignments = [ ```{code-cell} ipython3 y = ps.DEFAULTS.spatial_counters[0] cfg = ps.CreateKernelConfig() -cfg.target= ps.Target.CUDA +cfg.target= ps.Target.CurrentGPU cfg.iteration_slice = ps.make_slice[:, y:] ``` diff --git a/pyproject.toml b/pyproject.toml index 55b21cbbf..ae539b12c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ classifiers = [ "Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils" [project.optional-dependencies] -gpu = ['cupy'] alltrafos = ['islpy', 'py-cpuinfo'] bench_db = ['blitzdb', 'pymongo', 'pandas'] interactive = [ @@ -76,7 +75,7 @@ testsuite = [ 'matplotlib', 'py-cpuinfo', 'randomgen>=1.18', - 'scipy' + 'scipy', ] [build-system] -- GitLab From 4716c97aeae07e88525bd83bd6b3fb2b36926d0e Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Sat, 15 Mar 2025 11:26:38 +0100 Subject: [PATCH 09/15] final adjustments to the docs --- docs/source/installation.md | 2 +- docs/source/user_manual/gpu_kernels.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.md b/docs/source/installation.md index 5cb274c93..8fdb5684f 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -41,7 +41,7 @@ This requires a working installation of [Cupy](https://cupy.dev). Please refer to the cupy's [installation manual](https://docs.cupy.dev/en/stable/install.html) for details about installing cupy. -You can also use Cupy together with AMD ROCm for AMD graphics cards, +You can also use Cupy together with AMD ROCm and HIP for AMD graphics cards, but the setup steps are a bit more complicated - you might have to build cupy from source. The Cupy documentation covers this in their [installation guide for Cupy on ROCm][cupy-rocm]. diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index 14a29c41c..d272cf9c6 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -39,7 +39,7 @@ which operates much in the same way that [NumPy][numpy] works on CPU arrays. Cupy and NumPy expose nearly the same APIs for array operations; the difference being that CuPy allocates all its arrays on the GPU and performs its operations as CUDA kernels. -Also, CuPy exposes a just-in-time-compiler for GPU kernels, which internally calls [nvrtc]. +Also, CuPy exposes a just-in-time-compiler for GPU kernels. In pystencils, we use CuPy both to compile and provide executable kernels on-demand from within Python code, and to allocate and manage the data these kernels can be executed on. @@ -271,5 +271,4 @@ only a part of the triangle is being processed. [cupy]: https://cupy.dev "CuPy Homepage" [numpy]: https://numpy.org "NumPy Homepage" -[nvrtc]: https://docs.nvidia.com/cuda/nvrtc/index.html "NVIDIA Runtime Compilation Library" [cupy-docs]: https://docs.cupy.dev/en/stable/overview.html "CuPy Documentation" -- GitLab From 7db7f592e9691ced3052fa6301f45c89afa54165 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Sat, 15 Mar 2025 11:50:17 +0100 Subject: [PATCH 10/15] do not use cupy in code that must always be executable --- src/pystencils/jit/gpu_cupy.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 69e965325..7065ce815 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -208,12 +208,6 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): - self._runtime_headers: set[str] - if cp.cuda.runtime.is_hip: - self._runtime_headers = set() - else: - self._runtime_headers = {"<cstdint>"} - if len(default_block_size) > 3: raise ValueError( f"Invalid block size: {default_block_size}. Must be at most three-dimensional." @@ -234,12 +228,12 @@ class CupyJit(JitBase): raise JitError( "The CupyJit just-in-time compiler only accepts GPU kernels generated for CUDA or HIP" ) - + if kernel.target == Target.CUDA and cp.cuda.runtime.is_hip: raise JitError( "Cannot compile a CUDA kernel on a HIP-based Cupy installation." ) - + if kernel.target == Target.HIP and not cp.cuda.runtime.is_hip: raise JitError( "Cannot compile a HIP kernel on a CUDA-based Cupy installation." @@ -261,7 +255,13 @@ class CupyJit(JitBase): return tuple(options) def _prelude(self, kfunc: GpuKernel) -> str: - headers = self._runtime_headers + + headers: set[str] + if cp.cuda.runtime.is_hip: + headers = set() + else: + headers = {"<cstdint>"} + headers |= kfunc.required_headers if '"pystencils_runtime/half.h"' in headers: -- GitLab From 9b1219516168eaa987d2661a875922569284f8b6 Mon Sep 17 00:00:00 2001 From: Richard Angersbach <richard.angersbach@fau.de> Date: Mon, 17 Mar 2025 14:23:59 +0100 Subject: [PATCH 11/15] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Richard Angersbach <richard.angersbach@fau.de> --- docs/source/backend/gpu_codegen.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 0415c12c6..b487edbbb 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -11,7 +11,7 @@ The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP co It provides both the {any}`ThreadMapping` for the codegen backend, as well as the launch configuration for the runtime system. -## The CUDA Platform and Thread Mappings +## The GPU Platform and Thread Mappings ```{eval-rst} .. module:: pystencils.backend.platforms.generic_gpu -- GitLab From 6a06d72faeb040058b67bfec5ffee2cacff2bcad Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 18 Mar 2025 15:42:57 +0100 Subject: [PATCH 12/15] Merge branch 'v2.0-dev' into fhennig/hip-target --- docs/source/backend/gpu_codegen.md | 7 + docs/source/user_manual/gpu_kernels.md | 45 +- .../backend/platforms/generic_gpu.py | 29 +- src/pystencils/backend/platforms/sycl.py | 17 +- src/pystencils/codegen/config.py | 44 +- src/pystencils/codegen/driver.py | 24 +- src/pystencils/codegen/gpu_indexing.py | 410 +++++++++++++++--- src/pystencils/jit/gpu_cupy.py | 13 +- src/pystencils/utils.py | 5 + tests/kernelcreation/test_gpu.py | 133 +++++- 10 files changed, 573 insertions(+), 154 deletions(-) diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index b487edbbb..3fe00840e 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -68,6 +68,13 @@ These depend on the type of the launch configuration: while the `AutomaticLaunchConfiguration` permits no modification and computes grid and block size directly from kernel parameters, the `ManualLaunchConfiguration` requires the user to manually specifiy both grid and block size. +The `DynamicBlockSizeLaunchConfiguration` dynamically computes the grid size from either the default block size +or a computed block size. Computing block sizes can be signaled by the user via the `trim_block_size` or +`fit_block_size` member functions. These function receive an initial block size as an argument and adapt it. +The `trim_block_size` function trims the initial block size with the sizes of the iteration space, i.e. it takes +the minimum value of both sizes per dimension. The `fit_block_size` performs a block fitting algorithm that adapts +the initial block size by incrementally enlarging the trimmed block size until it is large enough +and aligns with the warp size. The `evaluate` method can only be used from within a Python runtime environment. When exporting pystencils CUDA kernels for external use in C++ projects, diff --git a/docs/source/user_manual/gpu_kernels.md b/docs/source/user_manual/gpu_kernels.md index d272cf9c6..7a3d54f6f 100644 --- a/docs/source/user_manual/gpu_kernels.md +++ b/docs/source/user_manual/gpu_kernels.md @@ -121,24 +121,49 @@ For most kernels with an at most three-dimensional iteration space, this behavior is sufficient and desired. It can be enforced by setting `gpu.indexing_scheme = "Linear3D"`. -If the `Linear3D` indexing scheme is used, you may modifiy the GPU thread block size in two places. -The default block size for the kernel can be set via the {any}`gpu.block_size <GpuOptions.block_size>` -code generator option; -if none is specified, a default depending on the iteration space's dimensionality will be used. - -The block size can furthermore be modified at the compiled kernel's wrapper object via the -`launch_config.block_size` attribute: +The GPU thread block size of a compiled kernel's wrapper object can only be directly modified +for manual launch configurations, cf. the section for [manual launch configurations](#manual_launch_grids). +Linear indexing schemes without manual launch configurations either employ default block sizes +or compute block sizes using user-exposed member functions that adapt the initial block size that was +passed as an argument. The available functions are :meth:`fit_block_size` and :meth:`trim_block_size`. +The :meth:`trim_block_size` function trims the user-defined initial block size with the iteration space. +The :meth:`fit_block_size` employs a fitting algorithm that finds a suitable block configuration that is +divisible by the hardware's warp size. ```{code-cell} ipython3 +:tags: [raises-exception] kfunc = kernel.compile() -kfunc.launch_config.block_size = (256, 2, 1) -# Run the kernel +# three different configuration cases for block size: + +# a) Nothing +# -> use default block size, perform no fitting, no trimming + +# b) Activate fitting with initial block size +kfunc.launch_config.fit_block_size((8, 8, 4)) + +# c) Activate trimming with initial block size +kfunc.launch_config.trim_block_size((8, 8, 4)) + +# finally run the kernel... kfunc(f=f_arr, g=g_arr) ``` +Block sizes aligning with multiples of the hardware's warp size allow for a better usage of the GPUs resources. +Even though such block size configurations are not enforced, notifying our code generator via the +`GpuOptions.assume_warp_aligned_block_size` option that the configured block size is divisible by the warp size allows +for further optimization potential, e.g. for warp-level reductions. +When setting this option to `True`, the user has to make sure that this alignment applies. +For [manual launch configurations](#manual_launch_grids), this can be achieved by manually providing suitable +block sizes via the `launch_config.block_size`. For the other launch configurations, this criterion is guaranteed +by using the default block sizes in pystencils. Using :meth:`fit_block_size` also guarantees this while also producing +block sizes that are better customized towards the kernel's iteration space. For :meth:`trim_block_size`, +the trimmed block's dimension that is closest the next multiple of the warp size is rounded up to the next multiple +of the warp size. For all cases, the final block size is checked against the imposed hardware limits and an error +is thrown in case these limits are exceeded. + In any case. pystencils will automatically compute the grid size from the shapes of the kernel's array arguments -and the given thread block size. +and the final thread block size. :::{attention} diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index fac37ffa5..11425d923 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -174,12 +174,10 @@ class GenericGpu(Platform): def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) - self._omit_range_check = omit_range_check self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -278,17 +276,12 @@ class GenericGpu(Platform): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - if not self._omit_range_check: - conds.append(PsLt(ctr_expr, dim.stop)) - - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - else: - body.statements = indexing_decls + body.statements - ast = body + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) return ast @@ -320,12 +313,8 @@ class GenericGpu(Platform): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr_expr.clone(), stop) + ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) return ast diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index f3c4bb3d5..d16c4f51b 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -35,12 +35,10 @@ class SyclPlatform(Platform): def __init__( self, ctx: KernelCreationContext, - omit_range_check: bool = False, automatic_block_size: bool = False, ): super().__init__(ctx) - self._omit_range_check = omit_range_check self._automatic_block_size = automatic_block_size @property @@ -136,8 +134,7 @@ class SyclPlatform(Platform): indexing_decls.append( PsDeclaration(ctr, dim.start + work_item_idx * dim.step) ) - if not self._omit_range_check: - conds.append(PsLt(ctr, dim.stop)) + conds.append(PsLt(ctr, dim.stop)) if conds: condition: PsExpression = conds[0] @@ -182,15 +179,9 @@ class SyclPlatform(Platform): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr, stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - else: - body.statements = [sparse_idx_decl] + body.statements - ast = body - - return ast + stop = PsExpression.make(ispace.index_list.shape[0]) + condition = PsLt(sparse_ctr, stop) + return PsBlock([sparse_idx_decl, PsConditional(condition, body)]) def _item_type(self, rank: int): if not self._automatic_block_size: diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index a765dea2e..91aff43f4 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -372,30 +372,38 @@ class GpuOptions(ConfigBase): indexing_scheme: Option[GpuIndexingScheme, str] = Option(GpuIndexingScheme.Linear3D) """Thread indexing scheme for dense GPU kernels.""" - omit_range_check: BasicOption[bool] = BasicOption(False) - """If set to `True`, omit the iteration counter range check. + manual_launch_grid: BasicOption[bool] = BasicOption(False) + """Always require a manually specified launch grid when running this kernel. - By default, the code generator introduces a check if the iteration counters computed from GPU block and thread - indices are within the prescribed loop range. - This check can be discarded through this option, at your own peril. + If set to `True`, the code generator will not attempt to infer the size of + the launch grid from the kernel. + The launch grid will then have to be specified manually at runtime. """ - block_size: BasicOption[tuple[int, int, int] | _AUTO_TYPE] = BasicOption(AUTO) - """Desired block size for the execution of GPU kernels. + warp_size: BasicOption[int] = BasicOption() + """Specifies the size of a warp (CUDA) or wavefront (HIP). - This option only takes effect if `Linear3D <GpuIndexingScheme.Linear3D>` - is chosen as an indexing scheme. - The block size may be overridden at runtime. + If this option is not set the default value for the given target will be automatically used. """ - manual_launch_grid: BasicOption[bool] = BasicOption(False) - """Always require a manually specified launch grid when running this kernel. + assume_warp_aligned_block_size: BasicOption[bool] = BasicOption(False) + """Specifies whether block sizes are divisible by the hardware's warp size. - If set to `True`, the code generator will not attempt to infer the size of - the launch grid from the kernel. - The launch grid will then have to be specified manually at runtime. + If set to `True`, the code generator can employ optimizations that require this assumption, + e.g. warp-level reductions. + The pystencils Cupy runtime also checks if user-provided block sizes fulfill this criterion. """ + @staticmethod + def default_warp_size(target: Target): + match target: + case Target.CUDA: + return 32 + case _: + raise NotImplementedError( + f"No default warp/wavefront size known for target {target}" + ) + @indexing_scheme.validate def _validate_idx_scheme(self, val: str | GpuIndexingScheme): if isinstance(val, GpuIndexingScheme): @@ -605,7 +613,7 @@ class CreateKernelConfig(ConfigBase): elif target == Target.CUDA or target == Target.HIP: try: from ..jit.gpu_cupy import CupyJit - + return CupyJit() except ImportError: @@ -738,9 +746,7 @@ class CreateKernelConfig(ConfigBase): UserWarning, ) - self.gpu = GpuOptions( - block_size=gpu_indexing_params.get("block_size", None) - ) + self.gpu = GpuOptions() def _deprecated_option(name, instead): # pragma: no cover diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 9e3c8b163..9f59e3510 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -11,6 +11,7 @@ from .config import ( GhostLayerSpec, IterationSliceSpec, GpuIndexingScheme, + GpuOptions, ) from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr @@ -401,13 +402,22 @@ class DefaultKernelCreationDriver: if not self._target.is_gpu(): return None - from .gpu_indexing import dim3 - idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") - block_size: dim3 | _AUTO_TYPE = self._cfg.gpu.get_option("block_size") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + warp_size: int | None = self._cfg.gpu.get_option("warp_size") - return GpuIndexing(self._ctx, idx_scheme, block_size, manual_launch_grid) + if warp_size is None: + warp_size = GpuOptions.default_warp_size(self._target) + + return GpuIndexing( + self._ctx, + self._target, + idx_scheme, + warp_size, + manual_launch_grid, + assume_warp_aligned_block_size, + ) def _get_platform(self) -> Platform: if Target._CPU in self._target: @@ -437,9 +447,6 @@ class DefaultKernelCreationDriver: ) elif self._target.is_gpu(): - gpu_opts = self._cfg.gpu - omit_range_check: bool = gpu_opts.get_option("omit_range_check") - thread_mapping = ( self._gpu_indexing.get_thread_mapping() if self._gpu_indexing is not None @@ -457,7 +464,6 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, - omit_range_check=omit_range_check, thread_mapping=thread_mapping, ) @@ -465,11 +471,9 @@ class DefaultKernelCreationDriver: from ..backend.platforms import SyclPlatform auto_block_size: bool = self._cfg.sycl.get_option("automatic_block_size") - omit_range_check = self._cfg.gpu.get_option("omit_range_check") return SyclPlatform( self._ctx, - omit_range_check=omit_range_check, automatic_block_size=auto_block_size, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 890afb0fb..0524eb0b1 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -1,13 +1,16 @@ from __future__ import annotations from abc import ABC, abstractmethod +from dataclasses import dataclass from typing import cast, Any, Callable from itertools import chain +from warnings import warn from .functions import Lambda from .parameters import Parameter from .errors import CodegenError -from .config import GpuIndexingScheme, _AUTO_TYPE +from .config import GpuIndexingScheme +from .target import Target from ..backend.kernelcreation import ( KernelCreationContext, @@ -16,13 +19,36 @@ from ..backend.kernelcreation import ( ) from ..backend.platforms.generic_gpu import ThreadMapping -from ..backend.ast.expressions import PsExpression +from ..backend.ast.expressions import PsExpression, PsIntDiv +from math import prod +from ..utils import ceil_to_multiple dim3 = tuple[int, int, int] _Dim3Lambda = tuple[Lambda, Lambda, Lambda] +@dataclass +class HardwareProperties: + warp_size: int + max_threads_per_block: int + max_block_sizes: dim3 + + def block_size_exceeds_hw_limits( + self, + block_size: tuple[int, ...] + ) -> bool: + """Checks if provided block size conforms limits given by the hardware.""" + + return ( + any( + size > max_size + for size, max_size in zip(block_size, self.max_block_sizes) + ) + or prod(block_size) > self.max_threads_per_block + ) + + class GpuLaunchConfiguration(ABC): """Base class for launch configurations for CUDA and HIP kernels. @@ -33,6 +59,18 @@ class GpuLaunchConfiguration(ABC): parameters to the associated kernel """ + @property + @abstractmethod + def block_size(self) -> dim3 | None: + """Returns desired block size if available.""" + pass + + @block_size.setter + @abstractmethod + def block_size(self, val: dim3): + """Sets desired block size if possible.""" + pass + @property @abstractmethod def parameters(self) -> frozenset[Parameter]: @@ -52,6 +90,25 @@ class GpuLaunchConfiguration(ABC): this launch configuration, such that when the configuration changes, the JIT parameter cache is invalidated.""" + @staticmethod + def get_default_block_size(rank: int) -> dim3: + """Returns the default block size configuration used by the generator.""" + + match rank: + case 1: + return (256, 1, 1) + case 2: + return (16, 16, 1) + case 3: + return (8, 8, 4) + case _: + assert False, "unreachable code" + + @staticmethod + def _excessive_block_size_error_msg(block_size: tuple[int, ...]): + return f"Unable to determine GPU block size for this kernel. \ + Final block size was too large: {block_size}." + class AutomaticLaunchConfiguration(GpuLaunchConfiguration): """Launch configuration that is dynamically computed from kernel parameters. @@ -63,20 +120,37 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): self, block_size: _Dim3Lambda, grid_size: _Dim3Lambda, + hw_props: HardwareProperties, + assume_warp_aligned_block_size: bool, ) -> None: self._block_size = block_size self._grid_size = grid_size + self._hw_props = hw_props + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size self._params: frozenset[Parameter] = frozenset().union( *(lb.parameters for lb in chain(block_size, grid_size)) ) + @property + def block_size(self) -> dim3 | None: + """Block size is only available when `evaluate` is called.""" + return None + + @block_size.setter + def block_size(self, val: dim3): + AttributeError("Setting `block_size` on an automatic launch configuration has no effect.") + @property def parameters(self) -> frozenset[Parameter]: return self._params def evaluate(self, **kwargs) -> tuple[dim3, dim3]: block_size = tuple(int(bs(**kwargs)) for bs in self._block_size) + + if self._hw_props.block_size_exceeds_hw_limits(block_size): + raise CodegenError(f"Block size {block_size} exceeds hardware limits.") + grid_size = tuple(int(gs(**kwargs)) for gs in self._grid_size) return cast(dim3, block_size), cast(dim3, grid_size) @@ -91,8 +165,12 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): """ def __init__( - self, + self, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool = False ) -> None: + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + self._hw_props = hw_props + self._block_size: dim3 | None = None self._grid_size: dim3 | None = None @@ -123,6 +201,18 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): if self._grid_size is None: raise AttributeError("No GPU grid size was set by the user.") + if ( + self._assume_warp_aligned_block_size + and prod(self._block_size) % self._hw_props.warp_size != 0 + ): + raise CodegenError( + "Specified block sizes must align with warp size with " + "`assume_warp_aligned_block_size` enabled." + ) + + if self._hw_props.block_size_exceeds_hw_limits(self._block_size): + raise CodegenError(self._excessive_block_size_error_msg(self._block_size)) + return self._block_size, self._grid_size def jit_cache_key(self) -> Any: @@ -130,24 +220,60 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): - """GPU launch configuration that permits the user to set a block size and dynamically computes the grid size. + """GPU launch configuration that dynamically computes the grid size from either the default block size + or a computed block size. Computing block sizes can be triggerred via the :meth:`trim_block_size` or + :meth:`fit_block_size` member functions. These functions adapt a user-defined initial block size that they + receive as an argument. The adaptation of the initial block sizes is described in the following: - The actual launch grid size is computed from the user-defined ``user_block_size`` and the number of work items - in the kernel's iteration space as follows. For each dimension :math:`c \\in \\{ x, y, z \\}`, - - if ``user_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` and ``grid_size.c = 1``; - - otherwise, ``block_size.c = user_block_size.c`` and ``grid_size.c = ceil(num_work_items.c / block_size.c)``. + - if :meth:`fit_block_size` was chosen: + + the initial block size is adapted such that it aligns with multiples of the hardware's warp size. + This is done using a fitting algorithm first trims the initial block size with the iteration space + and increases it incrementally until it is large enough and coincides with multiples of the warp size, i.e. + + ``block_size.c = _fit_block_size_to_it_space(iter_space.c, init_block_size.c, hardware_properties)`` + + The fitted block size also guarantees the user usage of `GpuOptions.assume_warp_aligned_block_size`. + + - elif :meth:`trim_block_size` was chosen: + + a trimming between the number of work items and the kernel's iteration space occurs, i.e. + + - if ``init_block_size.c > num_work_items.c``, ``block_size = num_work_items.c`` + - otherwise, ``block_size.c = init_block_size.c`` + + When `GpuOptions.assume_warp_aligned_block_size` is set, we ensure warp-alignment by + rounding the block size dimension that is closest the next multiple of the warp size. + + - otherwise: the default block size is taken i.e. + + ``block_size.c = get_default_block_size(rank=3).c`` + + The actual launch grid size is then computed as follows. + + ``grid_size.c = ceil(num_work_items.c / block_size.c)``. """ def __init__( self, num_work_items: _Dim3Lambda, - default_block_size: dim3 | None = None, + hw_props: HardwareProperties, + assume_warp_aligned_block_size: bool, ) -> None: self._num_work_items = num_work_items - self._block_size: dim3 | None = default_block_size + self._hw_props = hw_props + + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + self._default_block_size = default_bs + self._init_block_size: dim3 = default_bs + self._compute_block_size: ( + Callable[[dim3, dim3, HardwareProperties], tuple[int, ...]] | None + ) = None self._params: frozenset[Parameter] = frozenset().union( *(wit.parameters for wit in num_work_items) @@ -159,44 +285,188 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): dimension from kernel parameters.""" return self._num_work_items + @property + def parameters(self) -> frozenset[Parameter]: + """Parameters of this launch configuration""" + return self._params + @property def block_size(self) -> dim3 | None: - """The desired GPU block size.""" - return self._block_size + """Block size is only available when `evaluate` is called.""" + return None @block_size.setter def block_size(self, val: dim3): - self._block_size = val + AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.") + + @staticmethod + def _round_block_sizes_to_warp_size( + to_round: tuple[int, ...], warp_size: int + ) -> tuple[int, ...]: + # check if already aligns with warp size + if prod(to_round) % warp_size == 0: + return tuple(to_round) + + # find index of element closest to warp size and round up + index_to_round = to_round.index(max(to_round, key=lambda i: abs(i % warp_size))) + if index_to_round + 1 < len(to_round): + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + *to_round[index_to_round + 1:], + ) + else: + return ( + *to_round[:index_to_round], + ceil_to_multiple(to_round[index_to_round], warp_size), + ) - @property - def parameters(self) -> frozenset[Parameter]: - """Parameters of this launch configuration""" - return self._params + def trim_block_size(self, block_size: dim3): + def call_trimming_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._trim_block_size_to_it_space(it, bs, hw) - def evaluate(self, **kwargs) -> tuple[dim3, dim3]: - if self._block_size is None: - raise AttributeError("No GPU block size was specified by the user!") + self._init_block_size = block_size + self._compute_block_size = call_trimming_factory + + def _trim_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns specified block sizes trimmed with iteration space. + Raises CodegenError if trimmed block size does not conform hardware limits. + """ + + ret = tuple([min(b, i) for b, i in zip(block_size, it_space)]) + if hw_props.block_size_exceeds_hw_limits(ret): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + + if ( + self._assume_warp_aligned_block_size + and prod(ret) % self._hw_props.warp_size != 0 + ): + self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) + + return ret + + def fit_block_size(self, block_size: dim3): + def call_fitting_factory( + it: dim3, + bs: dim3, + hw: HardwareProperties, + ): + return self._fit_block_size_to_it_space(it, bs, hw) + + self._init_block_size = block_size + self._compute_block_size = call_fitting_factory + + def _fit_block_size_to_it_space( + self, + it_space: dim3, + block_size: dim3, + hw_props: HardwareProperties, + ) -> tuple[int, ...]: + """Returns an optimized block size configuration with block sizes being aligned with the warp size. + Raises CodegenError if optimal block size could not be found or does not conform hardware limits. + """ + def trim(to_trim: list[int]) -> list[int]: + return [min(b, i) for b, i in zip(to_trim, it_space)] + + def check_sizes_and_return(ret: tuple[int, ...]) -> tuple[int, ...]: + if hw_props.block_size_exceeds_hw_limits(ret): + raise CodegenError(self._excessive_block_size_error_msg(ret)) + return ret + + trimmed = trim(list(block_size)) + if ( + prod(trimmed) >= hw_props.warp_size + and prod(trimmed) % hw_props.warp_size == 0 + ): + # case 1: greater than min block size -> use trimmed result + return check_sizes_and_return(tuple(trimmed)) + + prev_trim_size = 0 + resize_order = [0, 2, 1] if len(it_space) == 3 else range(len(it_space)) + while prod(trimmed) is not prev_trim_size: + prev_trim_size = prod(trimmed) + + # case 2: trimmed block is equivalent to the whole iteration space + if all(b == i for b, i in zip(trimmed, it_space)): + return check_sizes_and_return( + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) + ) + else: + # double block size in each dimension until block is large enough (or case 2 triggers) + for d in resize_order: + trimmed[d] *= 2 + + # trim fastest moving dim to multiples of warp size + if ( + d == 0 + and trimmed[d] > hw_props.warp_size + and trimmed[d] % hw_props.warp_size != 0 + ): + # subtract remainder + trimmed[d] = trimmed[d] - (trimmed[d] % hw_props.warp_size) + + # check if block sizes are within hardware capabilities + trimmed[d] = min(trimmed[d], hw_props.max_block_sizes[d]) + + # trim again + trimmed = trim(trimmed) + + # case 3: trim block is large enough + if prod(trimmed) >= hw_props.warp_size: + return check_sizes_and_return( + self._round_block_sizes_to_warp_size( + tuple(trimmed), hw_props.warp_size + ) + ) + + raise CodegenError("Unable to determine GPU block size for this kernel.") + + def evaluate(self, **kwargs) -> tuple[dim3, dim3]: from ..utils import div_ceil num_work_items = cast( dim3, tuple(int(wit(**kwargs)) for wit in self._num_work_items) ) - reduced_block_size = cast( - dim3, - tuple(min(wit, bs) for wit, bs in zip(num_work_items, self._block_size)), - ) + + block_size: dim3 + if self._compute_block_size: + try: + computed_bs = self._compute_block_size( + num_work_items, self._init_block_size, self._hw_props + ) + + block_size = cast(dim3, computed_bs) + except CodegenError as e: + block_size = self._default_block_size + warn( + f"CodeGenError occurred: {getattr(e, 'message', repr(e))}. " + f"Block size fitting could not determine optimal block size configuration. " + f"Defaulting back to {self._default_block_size}." + ) + else: + block_size = self._default_block_size + grid_size = cast( dim3, - tuple( - div_ceil(wit, bs) for wit, bs in zip(num_work_items, reduced_block_size) - ), + tuple(div_ceil(wit, bs) for wit, bs in zip(num_work_items, block_size)), ) - return reduced_block_size, grid_size + return block_size, grid_size def jit_cache_key(self) -> Any: - return self._block_size + return () class GpuIndexing: @@ -218,14 +488,24 @@ class GpuIndexing: def __init__( self, ctx: KernelCreationContext, + target: Target, scheme: GpuIndexingScheme, - default_block_size: dim3 | _AUTO_TYPE | None = None, + warp_size: int, manual_launch_grid: bool = False, + assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx + self._target = target self._scheme = scheme - self._default_block_size = default_block_size + self._warp_size = warp_size self._manual_launch_grid = manual_launch_grid + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + + self._hw_props = HardwareProperties( + warp_size, + self.get_max_threads_per_block(target), + self.get_max_block_sizes(target), + ) from ..backend.kernelcreation import AstFactory from .driver import KernelFactory @@ -233,6 +513,26 @@ class GpuIndexing: self._ast_factory = AstFactory(self._ctx) self._kernel_factory = KernelFactory(self._ctx) + @staticmethod + def get_max_block_sizes(target: Target): + match target: + case Target.CUDA: + return (1024, 1024, 64) + case _: + raise CodegenError( + f"Cannot determine max GPU block sizes for target {target}" + ) + + @staticmethod + def get_max_threads_per_block(target: Target): + match target: + case Target.CUDA: + return 1024 + case _: + raise CodegenError( + f"Cannot determine max GPU threads per block for target {target}" + ) + def get_thread_mapping(self) -> ThreadMapping: """Retrieve a thread mapping object for use by the backend""" @@ -247,7 +547,13 @@ class GpuIndexing: def get_launch_config_factory(self) -> Callable[[], GpuLaunchConfiguration]: """Retrieve a factory for the launch configuration for later consumption by the runtime system""" if self._manual_launch_grid: - return ManualLaunchConfiguration + + def factory(): + return ManualLaunchConfiguration( + self._hw_props, self._assume_warp_aligned_block_size + ) + + return factory match self._scheme: case GpuIndexingScheme.Linear3D: @@ -268,10 +574,9 @@ class GpuIndexing: ) work_items_expr += tuple( - self._ast_factory.parse_index(1) - for _ in range(3 - rank) + self._ast_factory.parse_index(1) for _ in range(3 - rank) ) - + num_work_items = cast( _Dim3Lambda, tuple(self._kernel_factory.create_lambda(wit) for wit in work_items_expr), @@ -280,28 +585,12 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( num_work_items, - self._get_default_block_size(rank), + self._hw_props, + self._assume_warp_aligned_block_size, ) return factory - def _get_default_block_size(self, rank: int) -> dim3: - if self._default_block_size is None: - raise CodegenError("The default block size option was not set") - - if isinstance(self._default_block_size, _AUTO_TYPE): - match rank: - case 1: - return (256, 1, 1) - case 2: - return (128, 2, 1) - case 3: - return (128, 2, 2) - case _: - assert False, "unreachable code" - else: - return self._default_block_size - def _get_blockwise4d_config_factory( self, ) -> Callable[[], AutomaticLaunchConfiguration]: @@ -311,8 +600,19 @@ class GpuIndexing: if rank > 4: raise ValueError(f"Iteration space rank is too large: {rank}") + # impossible to use block size determination function since the iteration space is unknown + # -> round block size in fastest moving dimension up to multiple of warp size + rounded_block_size: PsExpression + if self._assume_warp_aligned_block_size: + warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) + rounded_block_size = self._ast_factory.parse_index( + PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1), + warp_size.clone()) * warp_size.clone()) + else: + rounded_block_size = work_items[0] + block_size = ( - self._kernel_factory.create_lambda(work_items[0]), + self._kernel_factory.create_lambda(rounded_block_size), self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), self._kernel_factory.create_lambda(self._ast_factory.parse_index(1)), ) @@ -328,6 +628,8 @@ class GpuIndexing: return AutomaticLaunchConfiguration( block_size, cast(_Dim3Lambda, grid_size), + self._hw_props, + self._assume_warp_aligned_block_size, ) return factory diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 7065ce815..221780961 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -1,4 +1,4 @@ -from typing import Any, Sequence, cast, Callable +from typing import Any, Callable from dataclasses import dataclass try: @@ -207,17 +207,6 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): - if len(default_block_size) > 3: - raise ValueError( - f"Invalid block size: {default_block_size}. Must be at most three-dimensional." - ) - - self._default_block_size: tuple[int, int, int] = cast( - tuple[int, int, int], - tuple(default_block_size) + (1,) * (3 - len(default_block_size)), - ) - def compile(self, kernel: Kernel) -> KernelWrapper: if not HAVE_CUPY: raise JitError( diff --git a/src/pystencils/utils.py b/src/pystencils/utils.py index 0049d0a2c..5ef13f31f 100644 --- a/src/pystencils/utils.py +++ b/src/pystencils/utils.py @@ -316,3 +316,8 @@ def div_ceil(divident, divisor): The result is unspecified if either argument is negative.""" return c_intdiv(divident + divisor - 1, divisor) + + +def ceil_to_multiple(divident, divisor): + """Rounds 'divident' to the next multiple of 'divisor'.""" + return div_ceil(divident, divisor) * divisor diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 8d943e8fd..944bd1241 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -12,6 +12,7 @@ from pystencils import ( create_kernel, Target, ) +from pystencils.codegen.gpu_indexing import GpuIndexing, HardwareProperties from pystencils.slicing import ( add_ghost_layers, @@ -19,6 +20,7 @@ from pystencils.slicing import ( remove_ghost_layers, normalize_slice, ) +from math import prod try: import cupy as cp @@ -29,10 +31,12 @@ except ImportError: @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_3d( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, + manual_grid: bool, + assume_warp_aligned_block_size: bool, ): src, dst = fields("src, dst: [3D]") asm = Assignment( @@ -47,8 +51,8 @@ def test_indexing_options_3d( cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -59,14 +63,100 @@ def test_indexing_options_3d( if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 8) - kernel.launch_config.grid_size = (4, 4, 2) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 10, 8) + kernel.launch_config.grid_size = (5, 4, 2) + else: + kernel.launch_config.block_size = (10, 10, 8) + kernel.launch_config.grid_size = (4, 4, 2) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (32, 16, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (32, 16, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 8) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (32, 8, 2) + else: + kernel.launch_config.block_size = (10, 10, 10) + + kernel(src=src_arr, dst=dst_arr) + + expected = cp.zeros_like(src_arr) + expected[1:-1, 1:-1, 1:-1].fill(6.0) + + cp.testing.assert_allclose(dst_arr, expected) + +@pytest.mark.parametrize("iteration_space", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) +@pytest.mark.parametrize("initial_block_size", + [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_block_size_adaptations( + iteration_space: tuple[int, int, int], + initial_block_size: tuple[int, int, int], + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): + src, dst = fields("src, dst: [3D]") + asm = Assignment( + dst.center(), + src[-1, 0, 0] + + src[1, 0, 0] + + src[0, -1, 0] + + src[0, 1, 0] + + src[0, 0, -1] + + src[0, 0, 1], + ) + + target = Target.CUDA + cfg = CreateKernelConfig(target=target) + cfg.gpu.indexing_scheme = "linear3d" + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size + + warp_size = cfg.gpu.default_warp_size(target) + max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) + max_block_sizes = GpuIndexing.get_max_block_sizes(target) + + ast = create_kernel(asm, cfg) + kernel = ast.compile() + + if use_block_fitting: + # test internal block fitting function later used in `kernel.launch_config.fit_block_size` + internal_block_size = kernel.launch_config._fit_block_size_to_it_space( + iteration_space, + initial_block_size, + HardwareProperties(warp_size, max_threads_per_block, max_block_sizes), + ) + + # checks if criterion for warp size alignment is fulfilled + def check_suitability(b): + return prod(b) >= warp_size and prod(b) % warp_size == 0 + + # block size fitting should not modify an already ideal configuration + # -> check if ideal configurations are modified + if ( + check_suitability(initial_block_size) + and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results + ): + assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \ + f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}." + + assert check_suitability(internal_block_size), \ + "Determined block size shall be divisible by warp size." + + # set block size via fitting algorithm + kernel.launch_config.fit_block_size(initial_block_size) + else: + # set block size via trimming algorithm + kernel.launch_config.trim_block_size(initial_block_size) + + src_arr = cp.ones(iteration_space) + dst_arr = cp.zeros_like(src_arr) kernel(src=src_arr, dst=dst_arr) @@ -77,10 +167,10 @@ def test_indexing_options_3d( @pytest.mark.parametrize("indexing_scheme", ["linear3d", "blockwise4d"]) -@pytest.mark.parametrize("omit_range_check", [False, True]) @pytest.mark.parametrize("manual_grid", [False, True]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [False, True]) def test_indexing_options_2d( - indexing_scheme: str, omit_range_check: bool, manual_grid: bool + indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool ): src, dst = fields("src, dst: [2D]") asm = Assignment( @@ -93,8 +183,8 @@ def test_indexing_options_2d( cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = indexing_scheme - cfg.gpu.omit_range_check = omit_range_check cfg.gpu.manual_launch_grid = manual_grid + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size ast = create_kernel(asm, cfg) kernel = ast.compile() @@ -105,14 +195,25 @@ def test_indexing_options_2d( if manual_grid: match indexing_scheme: case "linear3d": - kernel.launch_config.block_size = (10, 8, 1) - kernel.launch_config.grid_size = (4, 2, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + kernel.launch_config.grid_size = (5, 2, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) + kernel.launch_config.grid_size = (4, 2, 1) case "blockwise4d": - kernel.launch_config.block_size = (40, 1, 1) - kernel.launch_config.grid_size = (16, 1, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (64, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) + else: + kernel.launch_config.block_size = (40, 1, 1) + kernel.launch_config.grid_size = (16, 1, 1) elif indexing_scheme == "linear3d": - kernel.launch_config.block_size = (10, 8, 1) + if assume_warp_aligned_block_size: + kernel.launch_config.block_size = (8, 8, 1) + else: + kernel.launch_config.block_size = (10, 8, 1) kernel(src=src_arr, dst=dst_arr) -- GitLab From 9e7299033af893db0fd38a0d64e88493fd536476 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Tue, 18 Mar 2025 16:07:13 +0000 Subject: [PATCH 13/15] update tests --- conftest.py | 14 +++-- src/pystencils/codegen/config.py | 4 +- src/pystencils/codegen/driver.py | 2 + src/pystencils/codegen/gpu_indexing.py | 35 +++++++---- tests/kernelcreation/test_gpu.py | 84 +++++++++++++++----------- 5 files changed, 86 insertions(+), 53 deletions(-) diff --git a/conftest.py b/conftest.py index ff0467eff..7ea8f5ba0 100644 --- a/conftest.py +++ b/conftest.py @@ -43,10 +43,16 @@ def add_path_to_ignore(path): ] -collect_ignore = [ - os.path.join(SCRIPT_FOLDER, "doc", "conf.py"), - os.path.join(SCRIPT_FOLDER, "src", "pystencils", "opencl", "opencl.autoinit"), -] +def ignore_file(fp): + global collect_ignore + collect_ignore += [os.path.join(SCRIPT_FOLDER, fp)] + + +collect_ignore = [] + +ignore_file("noxfile.py") +ignore_file("docs/source/conf.py") +add_path_to_ignore("docs/build") add_path_to_ignore("tests/benchmark") add_path_to_ignore("_local_tmp") diff --git a/src/pystencils/codegen/config.py b/src/pystencils/codegen/config.py index 91aff43f4..8e7e54ff1 100644 --- a/src/pystencils/codegen/config.py +++ b/src/pystencils/codegen/config.py @@ -395,10 +395,12 @@ class GpuOptions(ConfigBase): """ @staticmethod - def default_warp_size(target: Target): + def default_warp_size(target: Target) -> int | None: match target: case Target.CUDA: return 32 + case Target.HIP: + return None case _: raise NotImplementedError( f"No default warp/wavefront size known for target {target}" diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 9f59e3510..543d9db2f 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -410,6 +410,8 @@ class DefaultKernelCreationDriver: if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) + # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True + return GpuIndexing( self._ctx, self._target, diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 0524eb0b1..b5e70043f 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -34,10 +34,7 @@ class HardwareProperties: max_threads_per_block: int max_block_sizes: dim3 - def block_size_exceeds_hw_limits( - self, - block_size: tuple[int, ...] - ) -> bool: + def block_size_exceeds_hw_limits(self, block_size: tuple[int, ...]) -> bool: """Checks if provided block size conforms limits given by the hardware.""" return ( @@ -106,8 +103,10 @@ class GpuLaunchConfiguration(ABC): @staticmethod def _excessive_block_size_error_msg(block_size: tuple[int, ...]): - return f"Unable to determine GPU block size for this kernel. \ - Final block size was too large: {block_size}." + return ( + "Unable to determine GPU block size for this kernel. " + f"Final block size was too large: {block_size}." + ) class AutomaticLaunchConfiguration(GpuLaunchConfiguration): @@ -139,7 +138,9 @@ class AutomaticLaunchConfiguration(GpuLaunchConfiguration): @block_size.setter def block_size(self, val: dim3): - AttributeError("Setting `block_size` on an automatic launch configuration has no effect.") + AttributeError( + "Setting `block_size` on an automatic launch configuration has no effect." + ) @property def parameters(self) -> frozenset[Parameter]: @@ -297,7 +298,9 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): @block_size.setter def block_size(self, val: dim3): - AttributeError("Setting `block_size` on an dynamic launch configuration has no effect.") + AttributeError( + "Setting `block_size` on an dynamic launch configuration has no effect." + ) @staticmethod def _round_block_sizes_to_warp_size( @@ -313,7 +316,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): return ( *to_round[:index_to_round], ceil_to_multiple(to_round[index_to_round], warp_size), - *to_round[index_to_round + 1:], + *to_round[index_to_round + 1 :], ) else: return ( @@ -518,6 +521,8 @@ class GpuIndexing: match target: case Target.CUDA: return (1024, 1024, 64) + case Target.HIP: + return (1024, 1024, 1024) case _: raise CodegenError( f"Cannot determine max GPU block sizes for target {target}" @@ -526,7 +531,7 @@ class GpuIndexing: @staticmethod def get_max_threads_per_block(target: Target): match target: - case Target.CUDA: + case Target.CUDA | Target.HIP: return 1024 case _: raise CodegenError( @@ -606,8 +611,14 @@ class GpuIndexing: if self._assume_warp_aligned_block_size: warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) rounded_block_size = self._ast_factory.parse_index( - PsIntDiv(work_items[0].clone() + warp_size.clone() - self._ast_factory.parse_index(1), - warp_size.clone()) * warp_size.clone()) + PsIntDiv( + work_items[0].clone() + + warp_size.clone() + - self._ast_factory.parse_index(1), + warp_size.clone(), + ) + * warp_size.clone() + ) else: rounded_block_size = work_items[0] diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index 944bd1241..bbe9aedd9 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -90,10 +90,32 @@ def test_indexing_options_3d( cp.testing.assert_allclose(dst_arr, expected) -@pytest.mark.parametrize("iteration_space", - [(8, 4, 4), (3, 8, 8), (3, 3, 16), (17, 3, 3), (3, 12, 56), (65, 65, 65), (3, 7, 9)]) -@pytest.mark.parametrize("initial_block_size", - [(8, 4, 4), (3, 8, 8), (3, 3, 16), (2, 2, 64), (8, 2, 1), (3, 1, 32), (32, 1, 1), (1, 2, 3)]) + +@pytest.mark.parametrize( + "iteration_space", + [ + (8, 4, 4), + (1, 8, 8), + (1, 1, 16), + (17, 1, 1), + (1, 12, 56), + (65, 65, 65), + (1, 7, 9), + ], +) +@pytest.mark.parametrize( + "initial_block_size", + [ + (8, 4, 4), + (1, 8, 8), + (1, 1, 16), + (2, 2, 64), + (8, 2, 1), + (3, 1, 32), + (32, 1, 1), + (1, 2, 3), + ], +) @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("use_block_fitting", [True, False]) def test_block_size_adaptations( @@ -102,7 +124,13 @@ def test_block_size_adaptations( assume_warp_aligned_block_size: bool, use_block_fitting: bool, ): - src, dst = fields("src, dst: [3D]") + field_shape = tuple(2 + x for x in iteration_space[::-1]) + src_arr = cp.ones(field_shape) + dst_arr = cp.zeros_like(src_arr) + + src = Field.create_from_numpy_array("src", src_arr) + dst = Field.create_from_numpy_array("dst", dst_arr) + asm = Assignment( dst.center(), src[-1, 0, 0] @@ -113,25 +141,20 @@ def test_block_size_adaptations( + src[0, 0, 1], ) - target = Target.CUDA + target = Target.CurrentGPU cfg = CreateKernelConfig(target=target) cfg.gpu.indexing_scheme = "linear3d" cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size - warp_size = cfg.gpu.default_warp_size(target) - max_threads_per_block = GpuIndexing.get_max_threads_per_block(target) - max_block_sizes = GpuIndexing.get_max_block_sizes(target) + warp_size = cfg.gpu.default_warp_size(cfg.get_target()) ast = create_kernel(asm, cfg) kernel = ast.compile() if use_block_fitting: # test internal block fitting function later used in `kernel.launch_config.fit_block_size` - internal_block_size = kernel.launch_config._fit_block_size_to_it_space( - iteration_space, - initial_block_size, - HardwareProperties(warp_size, max_threads_per_block, max_block_sizes), - ) + kernel.launch_config.fit_block_size(initial_block_size) + internal_block_size, _ = kernel.launch_config.evaluate() # checks if criterion for warp size alignment is fulfilled def check_suitability(b): @@ -139,25 +162,20 @@ def test_block_size_adaptations( # block size fitting should not modify an already ideal configuration # -> check if ideal configurations are modified - if ( - check_suitability(initial_block_size) - and all(x == y for x, y in zip(initial_block_size, iteration_space)) # trimming may alter results - ): - assert all(x == y for x, y in zip(initial_block_size, internal_block_size)), \ - f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}." - - assert check_suitability(internal_block_size), \ - "Determined block size shall be divisible by warp size." - - # set block size via fitting algorithm - kernel.launch_config.fit_block_size(initial_block_size) + if check_suitability(initial_block_size) and all( + x == y for x, y in zip(initial_block_size, iteration_space) + ): # trimming may alter results + assert all( + x == y for x, y in zip(initial_block_size, internal_block_size) + ), f"Initial block size unnecessarily adapted from {initial_block_size} to {internal_block_size}." + + assert check_suitability( + internal_block_size + ), "Determined block size shall be divisible by warp size." else: # set block size via trimming algorithm kernel.launch_config.trim_block_size(initial_block_size) - src_arr = cp.ones(iteration_space) - dst_arr = cp.zeros_like(src_arr) - kernel(src=src_arr, dst=dst_arr) expected = cp.zeros_like(src_arr) @@ -173,13 +191,7 @@ def test_indexing_options_2d( indexing_scheme: str, manual_grid: bool, assume_warp_aligned_block_size: bool ): src, dst = fields("src, dst: [2D]") - asm = Assignment( - dst.center(), - src[-1, 0] - + src[1, 0] - + src[0, -1] - + src[0, 1] - ) + asm = Assignment(dst.center(), src[-1, 0] + src[1, 0] + src[0, -1] + src[0, 1]) cfg = CreateKernelConfig(target=Target.CurrentGPU) cfg.gpu.indexing_scheme = indexing_scheme -- GitLab From 7c371fb43a0eb731268fa3fe8fd7861f0b4b5f68 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 19 Mar 2025 13:03:39 +0000 Subject: [PATCH 14/15] make no assumptions about warp_size for Target.HIP --- src/pystencils/codegen/driver.py | 4 +++- src/pystencils/codegen/gpu_indexing.py | 20 ++++++++++++++------ tests/kernelcreation/test_gpu.py | 5 +++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 543d9db2f..e9fc69b76 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import cast, Sequence, Callable, TYPE_CHECKING from dataclasses import dataclass, replace +from warnings import warn from .target import Target from .config import ( @@ -410,7 +411,8 @@ class DefaultKernelCreationDriver: if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) - # TODO: Warn if warp_size is None and assume_warp_aligned_block_size is True + if warp_size is None and assume_warp_aligned_block_size: + warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") return GpuIndexing( self._ctx, diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index b5e70043f..09570e345 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -30,7 +30,7 @@ _Dim3Lambda = tuple[Lambda, Lambda, Lambda] @dataclass class HardwareProperties: - warp_size: int + warp_size: int | None max_threads_per_block: int max_block_sizes: dim3 @@ -204,6 +204,7 @@ class ManualLaunchConfiguration(GpuLaunchConfiguration): if ( self._assume_warp_aligned_block_size + and self._hw_props.warp_size is not None and prod(self._block_size) % self._hw_props.warp_size != 0 ): raise CodegenError( @@ -316,7 +317,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): return ( *to_round[:index_to_round], ceil_to_multiple(to_round[index_to_round], warp_size), - *to_round[index_to_round + 1 :], + *to_round[index_to_round + 1:], ) else: return ( @@ -351,7 +352,8 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): if ( self._assume_warp_aligned_block_size - and prod(ret) % self._hw_props.warp_size != 0 + and hw_props.warp_size is not None + and prod(ret) % hw_props.warp_size != 0 ): self._round_block_sizes_to_warp_size(ret, hw_props.warp_size) @@ -387,6 +389,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): return ret trimmed = trim(list(block_size)) + + if hw_props.warp_size is None: + return tuple(trimmed) + if ( prod(trimmed) >= hw_props.warp_size and prod(trimmed) % hw_props.warp_size == 0 @@ -493,14 +499,13 @@ class GpuIndexing: ctx: KernelCreationContext, target: Target, scheme: GpuIndexingScheme, - warp_size: int, + warp_size: int | None, manual_launch_grid: bool = False, assume_warp_aligned_block_size: bool = False, ) -> None: self._ctx = ctx self._target = target self._scheme = scheme - self._warp_size = warp_size self._manual_launch_grid = manual_launch_grid self._assume_warp_aligned_block_size = assume_warp_aligned_block_size @@ -608,7 +613,10 @@ class GpuIndexing: # impossible to use block size determination function since the iteration space is unknown # -> round block size in fastest moving dimension up to multiple of warp size rounded_block_size: PsExpression - if self._assume_warp_aligned_block_size: + if ( + self._assume_warp_aligned_block_size + and self._hw_props.warp_size is not None + ): warp_size = self._ast_factory.parse_index(self._hw_props.warp_size) rounded_block_size = self._ast_factory.parse_index( PsIntDiv( diff --git a/tests/kernelcreation/test_gpu.py b/tests/kernelcreation/test_gpu.py index bbe9aedd9..a3f8a5482 100644 --- a/tests/kernelcreation/test_gpu.py +++ b/tests/kernelcreation/test_gpu.py @@ -145,8 +145,9 @@ def test_block_size_adaptations( cfg = CreateKernelConfig(target=target) cfg.gpu.indexing_scheme = "linear3d" cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size - - warp_size = cfg.gpu.default_warp_size(cfg.get_target()) + + warp_size = 32 + cfg.gpu.warp_size = warp_size ast = create_kernel(asm, cfg) kernel = ast.compile() -- GitLab From 3c959156c2d043f3ae3f679a9a518c46a15b827d Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 19 Mar 2025 13:29:51 +0000 Subject: [PATCH 15/15] expose default block size of dynamic launch config as property --- src/pystencils/codegen/gpu_indexing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 09570e345..43b612bd7 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -291,6 +291,10 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def parameters(self) -> frozenset[Parameter]: """Parameters of this launch configuration""" return self._params + + @property + def default_block_size(self) -> dim3: + return self._default_block_size @property def block_size(self) -> dim3 | None: -- GitLab