diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 3fe00840ed13faeb837d91711bca0be1a390aa4e..a95c365665bf803ce3cc683b40319c72dbf857ca 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -1,3 +1,4 @@
+(gpu_codegen)=
 # GPU Code Generation
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 0d384c55bc1e5933a055365f9d5ffe4143c902b6..b9b400544de4657d77bbc67a88db4092c3bd8001 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -16,6 +16,7 @@ who wish to customize or extend the behaviour of the code generator in their app
    iteration_space
    translation
    platforms
+   reduction_codegen
    transformations
    gpu_codegen
    errors
diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
new file mode 100644
index 0000000000000000000000000000000000000000..360c692568ada1147b99ef7c829ab6cdc9af02fa
--- /dev/null
+++ b/docs/source/backend/reduction_codegen.md
@@ -0,0 +1,139 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import pystencils as ps
+import numpy as np
+import cupy as cp
+```
+
+(codegen_reductions)=
+# Code Generation for Reductions
+
+In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain
+optimization strategies have.
+For this, we set up the update rule for a simple dot-product kernel:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x, y = ps.fields("x, y: double[3D]", layout="fzyx")
+
+assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center())
+```
+
+## CPU Platforms
+
+We first consider a base variant for CPUs that does not employ any optimizations.
+The generated code for this variant looks as follows:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel_cpu = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu)
+```
+
+We want the reduction kernel to be SIMD-vectorized and to employ shared-memory parallelism using OpenMP.
+The SIMD instruction sets supported for reductions are:
+* SSE3
+* AVX/AVX2
+* AVX512
+
+Below, AVX vectorization is employed by selecting the target `Target.X86_AVX`.
+**Note that reductions require `assume_inner_stride_one` to be enabled.**
+This is because other inner strides would require masked SIMD operations,
+which are not supported yet.
+
+```{code-cell} ipython3
+# configure SIMD vectorization
+cfg = ps.CreateKernelConfig(
+    target=ps.Target.X86_AVX,
+)
+cfg.cpu.vectorize.enable = True
+cfg.cpu.vectorize.assume_inner_stride_one = True
+
+# configure OpenMP parallelization
+cfg.cpu.openmp.enable = True
+cfg.cpu.openmp.num_threads = 8
+
+kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu_opt)
+```
+
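+The optimized kernel can be compiled and executed like any other pystencils kernel.
+The following sketch, assuming an AVX-capable host CPU and an available host compiler,
+runs it and compares the result with {any}`numpy.dot`:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+kernel_func = kernel_cpu_opt.compile()
+
+x_arr = np.ones((8, 8, 8), dtype="float64")
+y_arr = np.full((8, 8, 8), 2.0, dtype="float64")
+res = np.zeros((1,), dtype="float64")  # receives the reduction result
+
+kernel_func(x=x_arr, y=y_arr, r=res)
+res[0]  # equals np.dot(x_arr.ravel(), y_arr.ravel())
+```
+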
+## GPU Platforms
+
+Reductions are currently only supported for CUDA platforms.
+Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+kernel_gpu = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_gpu)
+```
+
+As evident from the code, the generated kernel employs atomic operations for updating the pointer
+holding the reduction result.
+Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance
+than with atomic operations alone.
+To generate kernels with warp-level reductions, the generator expects the CUDA block sizes to be divisible by
+the hardware's warp size.
+**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this
+criterion by enabling `assume_warp_aligned_block_size`.**
+While the default block sizes provided by the code generator already fulfill this criterion,
+we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space.
+
+You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+cfg.gpu.assume_warp_aligned_block_size = True
+
+kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+
+kernel_func = kernel_gpu_opt.compile()
+kernel_func.launch_config.fit_block_size((32, 1, 1))
+
+ps.inspect(kernel_gpu_opt)
+```
+
+:::{admonition} Developers To Do:
+
+- Support for HIP platforms
+- Support vectorization using NEON intrinsics
+:::
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6dba50af184ee95c7378a2e923bd76a6d97883a2..4e1070979fb7e02f3c6be799761f3999fec10476 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Topics
 
    user_manual/symbolic_language
    user_manual/kernelcreation
+   user_manual/reductions
    user_manual/gpu_kernels
    user_manual/WorkingWithTypes
 
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
new file mode 100644
index 0000000000000000000000000000000000000000..454fd7df4c4f686075c70e45f38bf2d5563ae6af
--- /dev/null
+++ b/docs/source/user_manual/reductions.md
@@ -0,0 +1,158 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import sympy as sp
+import pystencils as ps
+import numpy as np
+import cupy as cp
+
+from enum import Enum
+```
+
+(guide_reductions)=
+# Reductions in Pystencils
+
+Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements,
+such as computing sums or products over an array, or finding its minimum or maximum.
+
+## Specifying Assignments with Reductions
+
+In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`.
+Here is a snippet creating a reduction assignment for adding up all elements of a field:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x = ps.fields("x: double[3D]", layout="fzyx")
+
+assign_sum = ps.AddReductionAssignment(r, x.center())
+```
+
+For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the
+right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions.
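+
+Conceptually, the kernel thus computes the following, shown here as a plain
+NumPy sketch of the semantics rather than the actual generated code:
+
+```{code-cell} ipython3
+x_arr = np.ones((4, 4, 4))
+r_val = 0.0           # initial value of the reduction symbol
+r_val += x_arr.sum()  # accumulate x.center() over the whole iteration space
+r_val
+```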
+
+**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.**
+
+The following reduction assignment classes are available in pystencils:
+* `AddReductionAssignment`: Builds the sum over all elements
+* `SubReductionAssignment`: Builds the difference over all elements
+* `MulReductionAssignment`: Builds the product over all elements
+* `MinReductionAssignment`: Finds the minimum element
+* `MaxReductionAssignment`: Finds the maximum element
+
+:::{note}
+Alternatively, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions
+to specify reduction assignments:
+:::
+
+```{code-cell} ipython3
+from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str
+from pystencils.sympyextensions.reduction import ReductionOp
+
+assign_sum = reduction_assignment(r, ReductionOp.Add, x.center())
+
+assign_sum = reduction_assignment_from_str(r, "+", x.center())
+```
+
+For other reduction operations, the following enum constants can be passed to `reduction_assignment`,
+or the corresponding strings can be passed to `reduction_assignment_from_str`.
+
+```{code-cell} python3
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Min = "min"
+    Max = "max"
+```
+
+## Generating Reduction Kernels
+
+With the assignment fully assembled, we can finally invoke the code generator and
+create the kernel object via the {any}`create_kernel` function.
+For this example, we assume a kernel configuration where no optimizations are explicitly enabled.
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cfg)
+
+ps.inspect(kernel)
+```
+
+:::{note}
+The generated reduction kernels can differ substantially between platforms and optimization choices.
+For the sake of compactness, the impact of these choices is not shown here.
+
+A detailed description of the configuration options and their impact on the generated code can be found in
+{ref}`codegen_reductions`.
+:::
+
+The kernel can be compiled and run immediately.
+
+To execute the kernel on CPUs, a {any}`numpy.ndarray` has to be passed not only for each field,
+but also for exporting the reduction result.
+The export mechanism can be seen in the previously generated code snippet.
+Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`).
+This pointer not only allows providing an initial value for the reduction but is also used for writing back the
+reduction result.
+Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a single value.
+
+```{code-cell} ipython3
+kernel_func = kernel.compile()
+
+x_array = np.ones((4, 4, 4), dtype="float64")
+reduction_result = np.zeros((1,), dtype="float64")
+
+kernel_func(x=x_array, r=reduction_result)
+
+reduction_result[0]
+```
+
+For GPU platforms, the concepts remain the same, but the fields and the write-back pointer now require device
+memory, i.e. instances of {any}`cupy.ndarray`, as shown below.
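+
+A minimal sketch, assuming a CUDA-capable device is available
+(the kernel itself is unchanged apart from the target):
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_kernel_func = ps.create_kernel(assign_sum, gpu_cfg).compile()
+
+x_gpu = cp.ones((4, 4, 4), dtype="float64")
+r_gpu = cp.zeros((1,), dtype="float64")
+
+gpu_kernel_func(x=x_gpu, r=r_gpu)
+r_gpu[0]
+```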
+
+:::{admonition} Developers To Do:
+
+- Support for higher-order data types for reductions, e.g. vector/matrix reductions
+:::
diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 07283d5294bc08c8e68e6a40af0f956b36a0129a..329f61d32087df392406d17b5db6bcb520798317 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -1,10 +1,6 @@
 """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
 
-from .codegen import (
-    Target,
-    CreateKernelConfig,
-    AUTO
-)
+from .codegen import Target, CreateKernelConfig, AUTO
 from .defaults import DEFAULTS
 from . import fd
 from . import stencil as stencil
@@ -34,6 +30,13 @@ from .simp import AssignmentCollection
 from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator, tcast
 from .datahandling import create_data_handling
+from .sympyextensions.reduction import (
+    AddReductionAssignment,
+    SubReductionAssignment,
+    MulReductionAssignment,
+    MinReductionAssignment,
+    MaxReductionAssignment,
+)
 
 __all__ = [
     "Field",
@@ -61,6 +64,11 @@ __all__ = [
     "AssignmentCollection",
     "Assignment",
    "AddAugmentedAssignment",
+    "AddReductionAssignment",
+    "SubReductionAssignment",
+    "MulReductionAssignment",
+    "MinReductionAssignment",
+    "MaxReductionAssignment",
     "assignment_from_stencil",
     "SymbolCreator",
     "create_data_handling",
@@ -81,4 +89,5 @@ __all__ = [
 ]
 
 from . import _version
-__version__ = _version.get_versions()['version']
+
+__version__ = _version.get_versions()["version"]
diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py
index edeba04f2b8e5d8727abe1150b9e574808e6811a..7032690a03dcf851a5b5227abd70c98301b907e7 100644
--- a/src/pystencils/backend/ast/analysis.py
+++ b/src/pystencils/backend/ast/analysis.py
@@ -62,7 +62,7 @@ class UndefinedSymbolsCollector:
             case PsAssignment(lhs, rhs):
                 undefined_vars = self(lhs) | self(rhs)
 
-                if isinstance(lhs, PsSymbolExpr):
+                if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr):
                     undefined_vars.remove(lhs.symbol)
                 return undefined_vars
 
diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 705d250949f3662695d506feeff30c20649eb1c5..4141b029677e8a79065421c2108da2215ce5ca2f 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -3,8 +3,9 @@ from __future__ import annotations
 from typing import cast
 
 from .astnode import PsAstNode
-from .expressions import PsExpression, PsLvalue, PsUnOp
+from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp
 from .util import failing_cast
+from ...sympyextensions import ReductionOp
 from ...types import PsVectorType
 
 
@@ -17,7 +18,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     """Broadcast a scalar value to N vector lanes."""
 
     __match_args__ = ("lanes", "operand")
-    
+
     def __init__(self, lanes: int, operand: PsExpression):
         super().__init__(operand)
         self._lanes = lanes
@@ -25,26 +26,86 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     @property
     def lanes(self) -> int:
         return self._lanes
-    
+
     @lanes.setter
     def lanes(self, n: int):
         self._lanes = n
 
     def _clone_expr(self) -> PsVecBroadcast:
         return PsVecBroadcast(self._lanes, self._operand.clone())
-    
+
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecBroadcast):
             return False
+        return super().structurally_equal(other) and self._lanes == other._lanes
+
+
+class PsVecHorizontal(PsBinOp, PsVectorOp):
+    """Horizontal reduction of a vector's N lanes, combined with a scalar operand into a scalar result."""
+
+    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
+
+    def __init__(
+        self,
+        lanes: int,
+        scalar_operand: PsExpression,
+        vector_operand: PsExpression,
+        reduction_op: ReductionOp,
+    ):
+        super().__init__(scalar_operand, vector_operand)
+        self._lanes = lanes
+        self._reduction_op = reduction_op
+
+    @property
+    def lanes(self) -> int:
+        return self._lanes
+
+    @lanes.setter
+    def lanes(self, n: int):
+        self._lanes = n
+
+    @property
+    def scalar_operand(self) -> PsExpression:
+        return self._op1
+
+    @scalar_operand.setter
+    def scalar_operand(self, op: PsExpression):
+        self._op1 = op
+
+    @property
+    def vector_operand(self) -> PsExpression:
+        return self._op2
+
+    @vector_operand.setter
+    def vector_operand(self, op: PsExpression):
+        self._op2 = op
+
+    @property
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
+
+    @reduction_op.setter
+    def reduction_op(self, op: ReductionOp):
+        self._reduction_op = op
+
+    def _clone_expr(self) -> PsVecHorizontal:
+        return PsVecHorizontal(
+            self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op
+        )
+
+    def structurally_equal(self, other: PsAstNode) -> bool:
+        if not isinstance(other, PsVecHorizontal):
+            return False
         return (
             super().structurally_equal(other)
             and self._lanes == other._lanes
+            and self._reduction_op == other._reduction_op
         )
 
 
 class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     """Pointer-based vectorized memory access.
-    
+
     Args:
         base_ptr: Pointer identifying the accessed memory region
         offset: Offset inside the memory region
@@ -95,7 +156,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     @property
     def stride(self) -> PsExpression | None:
         return self._stride
-    
+
     @stride.setter
     def stride(self, expr: PsExpression | None):
         self._stride = expr
@@ -106,10 +167,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
 
     def get_vector_type(self) -> PsVectorType:
         return cast(PsVectorType, self._dtype)
-    
+
     def get_children(self) -> tuple[PsAstNode, ...]:
-        return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,))
-    
+        return (self._ptr, self._offset) + (
+            () if self._stride is None else (self._stride,)
+        )
+
     def set_child(self, idx: int, c: PsAstNode):
         idx = [0, 1, 2][idx]
         match idx:
@@ -138,7 +201,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
             and self._vector_entries == other._vector_entries
             and self._aligned == other._aligned
         )
-    
+
     def __repr__(self) -> str:
         return (
             f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, "
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index ffb65181ccd71ff95dffd6d006617dadc6809eea..22ae2f91a1ee25a3e8b5ab0097f46d5f8e6f05c2 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR
 
 from ..ast import PsAstNode
 from ..ast.expressions import PsBufferAcc
-from ..ast.vector import PsVecMemAcc, PsVecBroadcast
+from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
 
 if TYPE_CHECKING:
     from ...codegen import Kernel
@@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel):
 
 class IRAstPrinter(BasePrinter):
     """Print the IR AST as pseudo-code.
-    
+
     This printer produces a complete pseudocode representation of a pystencils AST.
     Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each
     node defined in `ast <pystencils.backend.ast>`.
@@ -77,6 +77,17 @@ class IRAstPrinter(BasePrinter):
                     f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest
                 )
 
+            case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op):
+                pc.push_op(Ops.Weakest, LR.Middle)
+                scalar_operand_code = self.visit(scalar_operand, pc)
+                vector_operand_code = self.visit(vector_operand, pc)
+                pc.pop_op()
+
+                return pc.parenthesize(
+                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code}, {vector_operand_code})",
+                    Ops.Weakest,
+                )
+
             case _:
                 return super().visit(node, pc)
 
diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index f3d18f3498079dcd06c2f4ffa61a87ff59e47117..4e38de5e9f11ca1d971ae6659f04e6df7b47f64a 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -30,6 +30,7 @@ from typing import Any, Sequence, TYPE_CHECKING
 from abc import ABC
 from enum import Enum
 
+from ..sympyextensions import ReductionOp
 from ..types import PsType
 from .exceptions import PsInternalCompilerError
 
@@ -95,17 +96,31 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
+class NumericLimitsFunctions(Enum):
+    """Numerical limits functions supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    Min = ("min", 0)
+    Max = ("max", 0)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
+
+
 class PsMathFunction(PsFunction):
     """Homogenously typed mathematical functions."""
 
     __match_args__ = ("func",)
 
-    def __init__(self, func: MathFunctions) -> None:
+    def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None:
         super().__init__(func.function_name, func.num_args)
         self._func = func
 
     @property
-    def func(self) -> MathFunctions:
+    def func(self) -> MathFunctions | NumericLimitsFunctions:
         return self._func
 
     def __str__(self) -> str:
@@ -121,6 +136,48 @@ class PsMathFunction(PsFunction):
         return hash(self._func)
 
 
+class ReductionFunctions(Enum):
+    """Functions representing the different steps of a reduction kernel that are supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    WriteBackToPtr = ("WriteBackToPtr", 2)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
+
+
+class PsReductionFunction(PsFunction):
+    """Reduction function that is parametrized with a reduction operation, e.g. the write-back step of a reduction."""
+
+    def __init__(self, func: ReductionFunctions, reduction_op: ReductionOp) -> None:
+        super().__init__(func.function_name, func.num_args)
+        self._func = func
+        self._reduction_op = reduction_op
+
+    @property
+    def func(self) -> ReductionFunctions:
+        return self._func
+
+    @property
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
+
+    def __str__(self) -> str:
+        return f"{self._func.function_name}"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, PsReductionFunction):
+            return False
+
+        return self._func == other._func and self._reduction_op == other._reduction_op
+
+    def __hash__(self) -> int:
+        return hash(self._func)
+
+
 class CFunction(PsFunction):
     """A concrete C function.
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 68da893ff6204c73d61dcebddb1da602f37520c7..358b5ff6cdeb6cca1cfc232c5129ae239bdb3be9 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
+from dataclasses import dataclass
 from typing import Iterable, Iterator, Any
 from itertools import chain, count
 from collections import namedtuple, defaultdict
 import re
 
+from ..ast.expressions import PsExpression
 from ...defaults import DEFAULTS
 from ...field import Field, FieldType
+from ...sympyextensions import ReductionOp
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
 from ..memory import PsSymbol, PsBuffer
@@ -44,6 +47,16 @@ class FieldsInKernel:
 FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 
 
+@dataclass(frozen=True)
+class ReductionInfo:
+    """Information about a reduction operation: its neutral element, used as the initial value,
+    and the pointer through which the kernel writes back the result."""
+
+    op: ReductionOp
+    init_val: PsExpression
+    ptr_symbol: PsSymbol
+
+
 class KernelCreationContext:
     """Manages the translation process from the SymPy frontend
     to the backend AST, and collects all necessary information for the translation:
@@ -75,6 +88,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
+        self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict()
+
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
 
@@ -93,7 +108,7 @@ class KernelCreationContext:
     def index_dtype(self) -> PsIntegerType:
         """Data type used by default for index expressions"""
         return self._index_dtype
-    
+
     def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType:
         """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are."""
         match dtype:
@@ -178,6 +193,20 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
+    def add_symbol_reduction_info(
+        self, local_symb: PsSymbol, reduction_info: ReductionInfo
+    ):
+        """Adds an entry for a symbol and its reduction info to the context's reduction lookup table.
+
+        The symbol ``local_symb`` must not already have an entry in this table.
+ """ + if local_symb in self._symbols_reduction_info: + raise PsInternalCompilerError( + f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table" + ) + + self._symbols_reduction_info[local_symb] = reduction_info + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: @@ -213,6 +242,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]: + """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" + return self._symbols_reduction_info + # Fields and Arrays @property diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index b3ff5aefb525ef311d0e3199c79f60c52617a853..9dc3928b3549e7d0a9eae4ae2e78244e4e9e4b4c 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,23 +1,25 @@ from typing import overload, cast, Any from functools import reduce -from operator import add, mul, sub, truediv +from operator import add, mul, sub import sympy as sp -import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment +from ..memory import PsSymbol from ...assignment import Assignment from ...simp import AssignmentCollection from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) +from ...reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...field import Field, FieldType -from .context import KernelCreationContext +from .context import KernelCreationContext, ReductionInfo from ..ast.structural import ( PsAstNode, @@ -55,14 +57,14 @@ from ..ast.expressions import ( PsAnd, PsOr, PsNot, - PsMemAcc + PsMemAcc, ) from ..ast.vector import PsVecMemAcc from ..constants import PsConstant -from ...types import PsNumericType, PsStructType, PsType +from ...types import PsNumericType, PsStructType, PsType, PsPointerType from ..exceptions import PsInputError -from ..functions import PsMathFunction, MathFunctions +from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError @@ -108,7 +110,9 @@ class FreezeExpressions: def __call__(self, obj: AssignmentCollection | sp.Basic) -> PsAstNode: if isinstance(obj, AssignmentCollection): - return PsBlock([cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments]) + return PsBlock( + [cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments] + ) elif isinstance(obj, AssignmentBase): return cast(PsAssignment, self.visit(obj)) elif isinstance(obj, _ExprLike): @@ -170,19 +174,69 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - match expr.op: - case "+=": - op = add - case "-=": - op = sub - case "*=": - op = mul - case "/=": - op = truediv + # transform augmented assignment to reduction op + str_to_reduction_op: dict[str, ReductionOp] = { + "+=": ReductionOp.Add, + "-=": ReductionOp.Sub, + "*=": ReductionOp.Mul, + "/=": ReductionOp.Div, + } + + # reuse existing handling for transforming reduction ops to expressions + return PsAssignment( + lhs, 
reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs) + ) + + def map_ReductionAssignment(self, expr: ReductionAssignment): + assert isinstance(expr.lhs, TypedSymbol) + + lhs = self.visit(expr.lhs) + rhs = self.visit(expr.rhs) + + assert isinstance(rhs, PsExpression) + assert isinstance(lhs, PsSymbolExpr) + + op = expr.reduction_op + orig_lhs_symb = lhs.symbol + dtype = lhs.dtype + + assert isinstance(dtype, PsNumericType), \ + "Reduction assignments require type information of the lhs symbol." + + # replace original symbol with pointer-based type used for export + orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) + + # create kernel-local copy of lhs symbol to work with + new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) + new_lhs = PsSymbolExpr(new_lhs_symb) + + # get new rhs from augmented assignment + new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs) + + # match for reduction operation and set neutral init_val + init_val: PsExpression + match op: + case ReductionOp.Add: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Sub: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Mul: + init_val = PsConstantExpr(PsConstant(1)) + case ReductionOp.Min: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + case ReductionOp.Max: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case _: - raise FreezeError(f"Unsupported augmented assignment: {expr.op}.") + raise FreezeError(f"Unsupported kind of reduction assignment: {op}.") - return PsAssignment(lhs, op(lhs.clone(), rhs)) + reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr) + + # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info + self._ctx.add_symbol(new_lhs_symb) + self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info) + self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr) + + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) @@ -214,7 +268,7 @@ class FreezeExpressions: exponent = expr.args[1] expr_frozen = self.visit_expr(base) - + if isinstance(exponent, sp.Rational): # Decompose rational exponent num: int = exponent.numerator @@ -234,7 +288,7 @@ class FreezeExpressions: denom = 1 assert denom == 1 - + # Pairwise multiplication for logarithmic runtime factors = [expr_frozen] + [expr_frozen.clone() for _ in range(num - 1)] while len(factors) > 1: @@ -250,7 +304,7 @@ class FreezeExpressions: expr_frozen = one / expr_frozen return expr_frozen - + # If we got this far, use pow exponent_frozen = self.visit_expr(exponent) expr_frozen = PsMathFunction(MathFunctions.Pow)(expr_frozen, exponent_frozen) @@ -280,22 +334,22 @@ class FreezeExpressions: raise FreezeError("Cannot translate an empty tuple.") items = [self.visit_expr(item) for item in expr] - + if any(isinstance(i, PsArrayInitList) for i in items): # base case: have nested arrays if not all(isinstance(i, PsArrayInitList) for i in items): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + subarrays = cast(list[PsArrayInitList], items) shape_tail = subarrays[0].shape - + if not all(s.shape == shape_tail for s in subarrays[1:]): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + return PsArrayInitList([s.items_grid for s in subarrays]) # type: ignore else: # base case: no nested arrays diff --git 
a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 62feca26504b753a9898153b4e6fb85e3fc5b2e7..b457f39a078009e329176845bce7e4a021be6861 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -49,7 +49,7 @@ from ..ast.expressions import ( PsNeg, PsNot, ) -from ..ast.vector import PsVecBroadcast, PsVecMemAcc +from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal from ..functions import PsMathFunction, CFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -194,9 +194,10 @@ class TypeContext: f" Target type: {self._target_type}" ) - case PsNumericOpTrait() if not isinstance( - self._target_type, PsNumericType - ) or self._target_type.is_bool(): + case PsNumericOpTrait() if ( + not isinstance(self._target_type, PsNumericType) + or self._target_type.is_bool() + ): # FIXME: PsBoolType derives from PsNumericType, but is not numeric raise TypificationError( f"Numerical operation encountered in non-numerical type context:\n" @@ -579,6 +580,33 @@ class Typifier: else: tc.apply_dtype(PsBoolType(), expr) + case PsVecHorizontal(): + # bin op consisting of a scalar and a vector that is converted to a scalar + # -> whole expression should be treated as scalar + + scalar_op_tc = TypeContext() + self.visit_expr(expr.scalar_operand, scalar_op_tc) + + vector_op_tc = TypeContext() + self.visit_expr(expr.vector_operand, vector_op_tc) + + if scalar_op_tc.target_type is None or vector_op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr}" + ) + + if not isinstance(scalar_op_tc.target_type, PsScalarType): + raise TypificationError( + f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}" + ) + + if not isinstance(vector_op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + ) + + tc.apply_dtype(scalar_op_tc.target_type, expr) + case PsBinOp(op1, op2): self.visit_expr(op1, tc) self.visit_expr(op2, tc) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 98ff3e3d332a46074931514ba3af1603dc6318b2..8c3cd45faf2a210bf8de16e3f25d9ba7897d4e0f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,139 @@ from __future__ import annotations +import math + from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import ( + PsExpression, + PsLiteralExpr, + PsCall, + PsAnd, + PsConstantExpr, + PsSymbolExpr, +) +from ..ast.structural import ( + PsConditional, + PsStatement, + PsAssignment, + PsBlock, + PsStructuralNode, +) +from ..constants import PsConstant +from ..exceptions import MaterializationError +from ..functions import NumericLimitsFunctions, CFunction +from ..literals import PsLiteral +from ...reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType +from ...types.quick import SInt, UInt class CudaPlatform(GenericGpu): - """Platform for the CUDA GPU taret.""" + """Platform for the CUDA GPU target.""" @property def required_headers(self) -> set[str]: - return set() + return super().required_headers | { + '"npp.h"', + } + + def resolve_reduction( + self, + 
ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        stype = symbol_expr.dtype
+        ptrtype = ptr_expr.dtype
+
+        assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+        assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+        if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+            raise NotImplementedError(
+                "Atomic operations are only available for float32/64 datatypes"
+            )
+
+        # workaround for subtractions -> use additions for reducing intermediate results
+        # similar to OpenMP reductions: local copies (negative sign) are added at the end
+        match reduction_op:
+            case ReductionOp.Sub:
+                actual_reduction_op = ReductionOp.Add
+            case _:
+                actual_reduction_op = reduction_op
+
+        # check if thread is valid for performing reduction
+        ispace = self._ctx.get_iteration_space()
+        is_valid_thread = self._get_condition_for_translation(ispace)
+
+        cond: PsExpression
+        shuffles: tuple[PsAssignment, ...]
+        if self._warp_size and self._assume_warp_aligned_block_size:
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(
+                    CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                    [
+                        full_mask,
+                        symbol_expr,
+                        PsConstantExpr(PsConstant(offset, SInt(32))),
+                    ],
+                )
+
+            # set up shuffle instructions for warp-level reduction
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(
+                PsAssignment(
+                    symbol_expr,
+                    reduction_op_to_expr(
+                        actual_reduction_op,
+                        symbol_expr,
+                        gen_shuffle_instr(pow(2, i - 1)),
+                    ),
+                )
+                for i in reversed(range(1, num_shuffles))
+            )
+
+            # find first thread in warp
+            first_thread_in_warp = self._first_thread_in_warp(ispace)
+
+            # set condition to only execute atomic operation on first valid thread in warp
+            cond = (
+                PsAnd(is_valid_thread, first_thread_in_warp)
+                if is_valid_thread
+                else first_thread_in_warp
+            )
+        else:
+            # no optimization: only execute atomic operation on valid threads
+            shuffles = ()
+            cond = is_valid_thread
+
+        # use atomic operation
+        func = CFunction(
+            f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void")
+        )
+        func_args = (ptr_expr, symbol_expr)
+
+        # assemble warp reduction
+        return shuffles, PsConditional(
+            cond, PsBlock([PsStatement(PsCall(func, func_args))])
+        )
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        match func:
+            case NumericLimitsFunctions.Min:
+                define = f"NPP_MINABS_{dtype.width}F"
+            case NumericLimitsFunctions.Max:
+                define = f"NPP_MAXABS_{dtype.width}F"
+            case _:
+                raise MaterializationError(
+                    f"Cannot materialize call to function {func}"
+                )
+
+        return PsLiteralExpr(PsLiteral(define, dtype))
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index fa8e54002679b9727578f923d4d409208268782c..3de7cf696ff47d65950e376e4aa38c63e1528dcf 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -1,10 +1,21 @@
 from abc import ABC, abstractmethod
 from typing import Sequence
 
-from pystencils.backend.ast.expressions import PsCall
-
-from ..functions import CFunction, PsMathFunction, MathFunctions
-from ...types import PsIntegerType, PsIeeeFloatType
+from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
+
+from ..ast import PsAstNode
+from ..functions import ( + CFunction, + MathFunctions, + NumericLimitsFunctions, + ReductionFunctions, + PsMathFunction, + PsReductionFunction, +) +from ..literals import PsLiteral +from ...reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform from ..exceptions import MaterializationError @@ -17,7 +28,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -26,6 +37,7 @@ from ..ast.expressions import ( PsGe, PsLe, PsTernary, + PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -43,7 +55,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<cmath>"} + return {"<cmath>", "<limits>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -55,13 +67,57 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) + + func = call_func.func + + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) + + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + + # create binop and potentially select corresponding function for e.g. 
min or max + potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + return self.select_function(potential_call) + + return potential_call - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args + if isinstance(dtype, PsScalarType) and func in ( + NumericLimitsFunctions.Min, + NumericLimitsFunctions.Max, + ): + return PsLiteralExpr( + PsLiteral( + f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", + dtype, + ) + ) + if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction match func: diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 11425d9238e8270b06a628555908622b623931d6..8a4dd11a29c3243bc1cbad2a857646a1a7be038e 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,7 +1,19 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from ...types import constify, deconstify +import operator +from abc import ABC, abstractmethod +from functools import reduce + +from ..ast import PsAstNode +from ..constants import PsConstant +from ...sympyextensions.reduction import ReductionOp +from ...types import ( + constify, + deconstify, + PsScalarType, + PsType, +) +from ...types.quick import SInt from ..exceptions import MaterializationError from .platform import Platform @@ -15,7 +27,12 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -23,12 +40,22 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, + PsConstantExpr, + PsAdd, + PsRem, + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction - +from ..functions import ( + MathFunctions, + CFunction, + ReductionFunctions, + NumericLimitsFunctions, + PsReductionFunction, + PsMathFunction, +) int32 = PsSignedIntegerType(width=32, const=False) @@ -117,7 +144,7 @@ class Blockwise4DMapping(ThreadMapping): THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -163,7 +190,7 @@ class Blockwise4DMapping(ThreadMapping): class GenericGpu(Platform): """Common base platform for CUDA- and HIP-type GPU targets. 
- + Args: ctx: The kernel creation context omit_range_check: If `True`, generated index translation code will not check if the point identified @@ -171,13 +198,40 @@ class GenericGpu(Platform): thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ + @property + @abstractmethod + def required_headers(self) -> set[str]: + return { + '"gpu_atomics.h"', + } + + @abstractmethod + def resolve_numeric_limits( + self, func: NumericLimitsFunctions, dtype: PsType + ) -> PsExpression: + pass + + @abstractmethod + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + pass + def __init__( self, ctx: KernelCreationContext, + assume_warp_aligned_block_size: bool, + warp_size: int | None, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._warp_size = warp_size + self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -194,14 +248,80 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + @staticmethod + def _get_condition_for_translation(ispace: IterationSpace): + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + @staticmethod + def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + """Returns thread indices multiplied with block dimension strides per dimension.""" + + return tuple( + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) + ) + + def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: + """Returns expression that determines whether a thread is the first within a warp.""" + + tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tid: PsExpression = tids_per_dim[0] + for t in tids_per_dim[1:]: + tid = PsAdd(tid, t) + + return PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) + + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) + + func = call_func.func + + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + return self.resolve_reduction(ptr_expr, symbol_expr, op) - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsIeeeFloatType): + if isinstance(dtype, PsScalarType) and isinstance(func, 
NumericLimitsFunctions):
+            return self.resolve_numeric_limits(func, dtype)
+
+        if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
             match func:
                 case (
                     MathFunctions.Exp
@@ -262,7 +382,6 @@ class GenericGpu(Platform):
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
-        conds = []
 
         dimensions = ispace.dimensions_in_loop_order()
 
@@ -276,14 +395,9 @@ class GenericGpu(Platform):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            conds.append(PsLt(ctr_expr, dim.stop))
 
-        condition: PsExpression = conds[0]
-        for cond in conds[1:]:
-            condition = PsAnd(condition, cond)
-        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock(indexing_decls + [PsConditional(cond, body)])
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -313,8 +427,5 @@ class GenericGpu(Platform):
         ]
         body.statements = mappings + body.statements
 
-        stop = PsExpression.make(ispace.index_list.shape[0])
-        condition = PsLt(sparse_ctr_expr.clone(), stop)
-        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock([sparse_idx_decl, PsConditional(cond, body)])
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index c758995a0d9f8fbbb2e9e424bf2cfa6ab7eca086..404d9bb27bf7eded15b9e2f7dc60fb70ceedb1fe 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,11 +1,40 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast import PsAstNode
+from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast.structural import PsStructuralNode
+from ..exceptions import MaterializationError
+from ..functions import NumericLimitsFunctions
+from ..literals import PsLiteral
+from ...sympyextensions import ReductionOp
+from ...types import PsType, PsIeeeFloatType
 
 
 class HipPlatform(GenericGpu):
-    """Platform for the HIP GPU taret."""
+    """Platform for the HIP GPU target."""
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}
+        return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"}
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        return PsLiteralExpr(
+            PsLiteral(
+                f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()",
+                dtype,
+            )
+        )
+
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+
+        raise MaterializationError("Reductions are not yet supported by the HIP backend.")
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 8ed4729a2d67777bbd132d9e48140e20e3656767..7b81865aef50d63e9913161933f90ee86b24533e 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 
-from ..ast.structural import PsBlock
+from ..ast import PsAstNode
+from ..ast.structural import PsBlock, PsStructuralNode
 from ..ast.expressions import PsCall, PsExpression
 
 from ..kernelcreation.context import KernelCreationContext
@@ -11,7 +12,7 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
The platform performs all target-dependent tasks during code generation: - + - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) - Platform-specific optimizations (e.g. vectorization, OpenMP) """ @@ -37,7 +38,7 @@ class Platform(ABC): @abstractmethod def select_function( self, call: PsCall - ) -> PsExpression: + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index d16c4f51b0609e50b1047c8f2fbd3824b8630b18..22d60f9b0e9f8b18995338973ade8ebe8caab8bf 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,12 +1,13 @@ from __future__ import annotations +from ..ast import PsAstNode from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( IterationSpace, FullIterationSpace, SparseIterationSpace, ) -from ..ast.structural import PsDeclaration, PsBlock, PsConditional +from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode from ..ast.expressions import ( PsExpression, PsSymbolExpr, @@ -55,7 +56,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 3f0285f0c9d75e1df00d1b662ade818a8b50a9cc..add38cfe419ee352a23bf06aefa6f56c22c59d11 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Sequence +from typing import Sequence, Tuple from enum import Enum from functools import cache @@ -17,8 +17,9 @@ from ..ast.expressions import ( PsCast, PsCall, ) -from ..ast.vector import PsVecMemAcc, PsVecBroadcast -from ...types import PsCustomType, PsVectorType, PsPointerType +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp +from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant from ..exceptions import MaterializationError @@ -132,6 +133,8 @@ class X86VectorCpu(GenericVectorCpu): else: headers = {"<immintrin.h>"} + headers.update({'"simd_horizontal_helpers.h"'}) + return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: @@ -160,7 +163,14 @@ class X86VectorCpu(GenericVectorCpu): ) -> PsExpression: match expr: case PsUnOp() | PsBinOp(): - func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) + vtype: PsType + if isinstance(expr, PsVecHorizontal): + # return type of expression itself is scalar, but input argument to intrinsic is a vector + vtype = expr.vector_operand.get_dtype() + else: + vtype = expr.get_dtype() + + func = _x86_op_intrin(self._vector_arch, expr, vtype) intrinsic = func(*operands) intrinsic.dtype = func.return_type return intrinsic @@ -339,6 +349,7 @@ def _x86_op_intrin( prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) rtype = atype = varch.intrin_type(vtype) + 
atypes: Tuple[PsType, ...] = ()
 
     match op:
         case PsVecBroadcast():
             opstr = "set1"
             if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
                 suffix += "x"
             atype = vtype.scalar_type
+        case PsVecHorizontal():
+            # horizontal add instead of sub avoids double inversion of sign
+            actual_op = (
+                ReductionOp.Add
+                if op.reduction_op == ReductionOp.Sub
+                else op.reduction_op
+            )
+            opstr = f"horizontal_{actual_op.name.lower()}"
+            rtype = vtype.scalar_type
+            atypes = (vtype.scalar_type, vtype)
         case PsAdd():
             opstr = "add"
         case PsSub():
@@ -392,7 +413,9 @@ def _x86_op_intrin(
         case (SInt(64), Fp()) | (
             Fp(),
             SInt(64),
-        ) if varch < X86VectorArch.AVX512:
+        ) if (
+            varch < X86VectorArch.AVX512
+        ):
             panic()
         # AVX512 only: cvtepiA_epiT if A > T
         case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512:
@@ -408,4 +431,7 @@ def _x86_op_intrin(
     )
 
     num_args = 1 if isinstance(op, PsUnOp) else 2
-    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
+    if not atypes:
+        atypes = (atype,) * num_args
+
+    return CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype)
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index bd782422f1fa80b96ec7cf69473fda2b1f45c3d6..fa466e495c826fca6c8f0ae1f338dfd868436ebd 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -9,6 +9,7 @@ from ..ast import PsAstNode
 from ..ast.structural import PsBlock, PsLoop, PsPragma, PsStructuralNode
 from ..ast.expressions import PsExpression
 
+from ...types import PsScalarType
 
 __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"]
 
@@ -122,6 +123,17 @@ class AddOpenMP:
         if num_threads is not None:
             pragma_text += f" num_threads({str(num_threads)})"
 
+        if ctx.symbols_reduction_info:
+            for symbol, reduction_info in ctx.symbols_reduction_info.items():
+                if isinstance(symbol.dtype, PsScalarType):
+                    pragma_text += (
+                        f" reduction({reduction_info.op.value}: {symbol.name})"
+                    )
+                else:
+                    raise NotImplementedError(
+                        "OMP: Reductions for non-scalar data types are not supported yet."
+ ) + if collapse is not None: if collapse <= 0: raise ValueError( diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6..48b9ad0da567a6221c33c594795c8d4b6e745ea0 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,9 +7,15 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression, PsTernary, PsGt -from ..ast.vector import PsVecBroadcast +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) +from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr +from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer @@ -134,6 +140,36 @@ class LoopVectorizer: # Prepare vectorization context vc = VectorizationContext(self._ctx, self._lanes, axis) + # Prepare reductions + simd_init_local_reduction_vars: list[PsStructuralNode] = [] + simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] + for symb, reduction_info in self._ctx.symbols_reduction_info.items(): + # Vectorize symbol for local copy + vector_symb = vc.vectorize_symbol(symb) + + # Declare and init vector + simd_init_local_reduction_vars += [ + self._type_fold( + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + ) + ) + ] + + # Write back vectorization result + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(symb), + PsVecHorizontal( + self._lanes, + PsSymbolExpr(symb), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] + # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -224,10 +260,10 @@ class LoopVectorizer: ) return PsBlock( - [ - simd_stop_decl, - simd_step_decl, - simd_loop, + simd_init_local_reduction_vars + + [simd_stop_decl, simd_step_decl, simd_loop] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -238,11 +274,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop, ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index e41c345ae4ed71101d07fcaa5b9df88b1e0f54e2..9ce4046931036a5e16eda2c9cd16faa272752acc 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,7 +1,9 @@ +from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode +from ..exceptions import MaterializationError from ..platforms import Platform from ..ast import PsAstNode -from ..ast.expressions import PsCall -from ..functions import PsMathFunction +from ..ast.expressions import PsCall, PsExpression +from ..functions import PsMathFunction, PsReductionFunction class SelectFunctions: @@ -17,7 +19,39 @@ class SelectFunctions: def visit(self, node: PsAstNode) -> PsAstNode: node.children = [self.visit(c) for c in node.children] - if isinstance(node, 
PsCall) and isinstance(node.function, PsMathFunction): - return self._platform.select_function(node) + if isinstance(node, PsAssignment): + rhs = node.rhs + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionFunction + ): + resolved_func = self._platform.select_function(rhs) + + match resolved_func: + case (prepend, new_rhs): + assert isinstance(prepend, tuple) + + match new_rhs: + case PsExpression(): + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) + case PsStructuralNode(): + # special case: produces structural with atomic operation writing value back to ptr + return PsBlock(prepend + (new_rhs,)) + case _: + assert False, "Unexpected output from SelectFunctions." + case PsExpression(): + return PsAssignment(node.lhs, resolved_func) + case _: + raise MaterializationError( + f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions." + ) + else: + return node + elif isinstance(node, PsCall) and isinstance(node.function, PsMathFunction): + resolved_func = self._platform.select_function(node) + assert isinstance(resolved_func, PsExpression) + + return resolved_func else: return node diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 060192810a7ccb9ab9ed13f64dd7948791078ea4..b20614393b439a8e5e70d682362dbec6769df9c2 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp -from ..ast.vector import PsVecMemAcc +from ..ast.vector import PsVecMemAcc, PsVecHorizontal from ..exceptions import MaterializationError from ..functions import CFunction, PsMathFunction @@ -86,6 +86,10 @@ class SelectIntrinsics: new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) + case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal): + new_rhs = self.visit_expr(rhs, sc) + return PsAssignment(lhs, new_rhs) + case _: node.children = [self.visit(c, sc) for c in node.children] @@ -93,7 +97,15 @@ class SelectIntrinsics: def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression: if not isinstance(expr.dtype, PsVectorType): - return expr + # special case: result type of horizontal reduction is scalar + if isinstance(expr, PsVecHorizontal): + scalar_op = expr.scalar_operand + vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) + else: + return expr match expr: case PsSymbolExpr(symb): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index e9fc69b76b3d88024f9cbde880617d6a3a3696ff..c285dd7bf1a5fb786ed27472004325e0efcfe5f0 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -25,8 +25,15 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop -from ..backend.ast.expressions import PsExpression +from ..backend.functions import PsReductionFunction, ReductionFunctions +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + 
PsCall, + PsMemAcc, + PsConstantExpr, +) +from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -183,6 +190,31 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Extensions for reductions + for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(symbol)) + ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) + init_val = typify(reduction_info.init_val) + + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionFunction( + ReductionFunctions.WriteBackToPtr, reduction_info.op + ), + [ptr_symbol_expr, symbol_expr], + ) + + # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast = [PsAssignment(ptr_access, write_back_ptr)] + + kernel_ast.statements = prepend_ast + kernel_ast.statements + kernel_ast.statements += append_ast + # Target-Specific optimizations if self._target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -405,14 +437,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." 
+ ) return GpuIndexing( self._ctx, @@ -457,6 +493,11 @@ class DefaultKernelCreationDriver: else None ) + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) + warp_size: int | None = self._cfg.gpu.get_option("warp_size") + GpuPlatform: type match self._target: case Target.CUDA: @@ -468,6 +509,8 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, + assume_warp_aligned_block_size, + warp_size, thread_mapping=thread_mapping, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 43b612bd77f535ef45c09c38489784dd12a53ce0..f5606da0256039ba187d62583f6838e825002f63 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + rank: int, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool, @@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + default_bs = GpuLaunchConfiguration.get_default_block_size(rank) self._default_block_size = default_bs self._init_block_size: dim3 = default_bs self._compute_block_size: ( @@ -598,6 +599,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( + rank, num_work_items, self._hw_props, self._assume_warp_aligned_block_size, diff --git a/src/pystencils/include/gpu_atomics.h b/src/pystencils/include/gpu_atomics.h new file mode 100644 index 0000000000000000000000000000000000000000..6de5c3321e1cff84e00cdf7b3c551638ecb2a99e --- /dev/null +++ b/src/pystencils/include/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = 
__float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..bd1889153a56876032781b77dcb66a413b3fe07d --- /dev/null +++ b/src/pystencils/include/simd_horizontal_helpers.h @@ -0,0 +1,231 @@ +#pragma once + +#include <cmath> + +#if defined(__SSE3__) +#include <immintrin.h> + +inline double _mm_horizontal_add_pd(double dst, __m128d src) { + __m128d _v = src; + return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v)); +} + +inline float _mm_horizontal_add_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_hadd_ps(_v, _v); + return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h))); +} + +inline double _mm_horizontal_mul_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return dst * _r; +} + +inline float _mm_horizontal_mul_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm_horizontal_min_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmin(_r, dst); +} + +inline float _mm_horizontal_min_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm_horizontal_max_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmax(_r, dst); +} + +inline float _mm_horizontal_max_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX__) +#include <immintrin.h> + +inline double _mm256_horizontal_add_pd(double dst, __m256d src) { + __m256d _v = src; + __m256d _h = _mm256_hadd_pd(_v, _v); + return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} + +inline float _mm256_horizontal_add_ps(float dst, __m256 src) { + __m256 _v = src; + __m256 _h = _mm256_hadd_ps(_v, _v); + __m128 _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h)); + return dst + 
_mm_cvtss_f32(_mm_hadd_ps(_i,_i));
+}
+
+inline double _mm256_horizontal_mul_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1)));
+    return dst * _r;
+}
+
+inline float _mm256_horizontal_mul_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return dst * _r;
+}
+
+inline double _mm256_horizontal_min_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1)));
+    return fmin(_r, dst);
+}
+
+inline float _mm256_horizontal_min_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return fmin(_r, dst);
+}
+
+inline double _mm256_horizontal_max_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1)));
+    return fmax(_r, dst);
+}
+
+inline float _mm256_horizontal_max_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(__AVX512F__)
+#include <immintrin.h>
+
+inline double _mm512_horizontal_add_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_add_pd(src);
+    return dst + _r;
+}
+
+inline float _mm512_horizontal_add_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_add_ps(src);
+    return dst + _r;
+}
+
+inline double _mm512_horizontal_mul_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_mul_pd(src);
+    return dst * _r;
+}
+
+inline float _mm512_horizontal_mul_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_mul_ps(src);
+    return dst * _r;
+}
+
+inline double _mm512_horizontal_min_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_min_pd(src);
+    return fmin(_r, dst);
+}
+
+inline float _mm512_horizontal_min_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_min_ps(src);
+    return fmin(_r, dst);
+}
+
+inline double _mm512_horizontal_max_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_max_pd(src);
+    return fmax(_r, dst);
+}
+
+inline float _mm512_horizontal_max_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_max_ps(src);
+    return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(_M_ARM64)
+#include <arm_neon.h>
+
+inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r += vgetq_lane_f64(_v,1);
+    return dst + _r;
+}
+
+inline float vget_horizontal_add_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r += vget_lane_f32(_w,1);
+    return dst + _r;
+}
+
+inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r *= vgetq_lane_f64(_v,1);
+    return dst * _r;
+}
+
+inline float vget_horizontal_mul_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r *= vget_lane_f32(_w,1);
+    return dst * _r;
+}
+
+inline double vgetq_horizontal_min_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r = fmin(_r, vgetq_lane_f64(_v,1));
+    return fmin(_r, dst);
+}
+
+inline float vget_horizontal_min_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r = fmin(_r, vget_lane_f32(_w,1));
+    return fmin(_r, dst);
+}
+
+inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r = fmax(_r, vgetq_lane_f64(_v,1));
+    return fmax(_r, dst);
+}
+
+inline float vget_horizontal_max_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r = fmax(_r, vget_lane_f32(_w,1));
+    return fmax(_r, dst);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index fca043db90725a441cb8b0ed9b99765c53eecb8d..4d76ea9ca1d75bc2b2ca2d77976b42c5f16b240c 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -92,6 +92,7 @@ class PsKernelExtensioNModule:

         # Kernels and call wrappers
         from ..backend.emission import CAstPrinter
+
         printer = CAstPrinter(func_prefix="FUNC_PREFIX")

         for name, kernel in self._kernels.items():
@@ -200,13 +201,15 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 """

     def __init__(self) -> None:
-        self._buffer_types: dict[Field, PsType] = dict()
-        self._array_extractions: dict[Field, str] = dict()
-        self._array_frees: dict[Field, str] = dict()
+        self._buffer_types: dict[Any, PsType] = dict()
+        self._array_extractions: dict[Any, str] = dict()
+        self._array_frees: dict[Any, str] = dict()

         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()

+        self._pointer_extractions: dict[Parameter, str] = dict()
+
         self._constraint_checks: list[str] = []

         self._call: str | None = None
@@ -232,38 +235,42 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         else:
             return None

+    def get_buffer(self, buffer_name: str) -> str:
+        """Get the Python buffer object for a given buffer name."""
+        return f"buffer_{buffer_name}"
+
     def get_field_buffer(self, field: Field) -> str:
         """Get the Python buffer object for the given field."""
-        return f"buffer_{field.name}"
+        return self.get_buffer(field.name)

-    def extract_field(self, field: Field) -> None:
-        """Add the necessary code to extract the NumPy array for a given field"""
-        if field not in self._array_extractions:
-            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name)
-            actual_dtype = self._buffer_types[field]
+    def extract_buffer(self, buffer: Any, name: str) -> None:
+        """Add the necessary code to extract the NumPy array for a given buffer"""
+        if buffer not in self._array_extractions:
+            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name)
+            actual_dtype = self._buffer_types[buffer]

             # Check array type
             type_char = self._type_char(actual_dtype)
             if type_char is not None:
-                dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'"
+                dtype_cond = f"buffer_{name}.format[0] == '{type_char}'"
f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, + name=name, expected=str(actual_dtype), ) # Check item size itemsize = actual_dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize + cond=item_size_cond, what="itemsize", name=name, expected=itemsize ) - self._array_extractions[field] = extraction_code + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + self._array_frees[buffer] = release_code def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -277,6 +284,26 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_ptr(self, param: Parameter) -> str: + if param not in self._pointer_extractions: + ptr = param.symbol + ptr_dtype = ptr.dtype + + assert isinstance(ptr_dtype, PsPointerType) + + self._buffer_types[ptr] = ptr_dtype.base_type + self.extract_buffer(ptr, param.name) + buffer = self.get_buffer(param.name) + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) + + assert code is not None + + self._array_assoc_var_extractions[param] = code + + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -321,11 +348,13 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ actual_field_type = field.dtype self._buffer_types[prop.field] = actual_field_type - self.extract_field(prop.field) + self.extract_buffer(prop.field, field.name) for param in params: if param.is_field_parameter: self.extract_array_assoc_var(param) + elif isinstance(param.dtype, PsPointerType): + self.extract_ptr(param) else: self.extract_scalar(param) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 2217809618ec9a161d5e2d6f8c3c374e399e0471..760dccf41883c26b21c0fd8fde5820832bead3d1 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,6 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -20,7 +19,7 @@ from ..codegen import ( ) from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr -from ..types import PsStructType, PsPointerType +from ..types import PsType, PsStructType, PsPointerType from ..include import get_pystencils_include_path @@ -186,10 +185,13 @@ class CupyKernelWrapper(KernelWrapper): arr.strides[coord] // arr.dtype.itemsize, ) break + elif isinstance(kparam.dtype, PsPointerType): + val = kwargs[kparam.name] + kernel_args.append(val) else: # scalar parameter - val: Any = kwargs[param.name] - adder(param, val) + val = kwargs[kparam.name] + adder(kparam, val) # Process Arguments diff --git a/src/pystencils/reduction_op_mapping.py b/src/pystencils/reduction_op_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..06fb8aa3e981d661f8e4226a34307f03cd6d9a70 --- /dev/null +++ b/src/pystencils/reduction_op_mapping.py @@ -0,0 +1,39 @@ +from 
+from .backend.exceptions import FreezeError
+from .backend.functions import PsMathFunction, MathFunctions
+from .sympyextensions.reduction import ReductionOp
+
+_available_operator_interface: set[ReductionOp] = {
+    ReductionOp.Add,
+    ReductionOp.Sub,
+    ReductionOp.Mul,
+    ReductionOp.Div,
+}
+
+
+def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
+    if op in _available_operator_interface:
+        match op:
+            case ReductionOp.Add:
+                operator = PsAdd
+            case ReductionOp.Sub:
+                operator = PsSub
+            case ReductionOp.Mul:
+                operator = PsMul
+            case ReductionOp.Div:
+                operator = PsDiv
+            case _:
+                raise FreezeError(
+                    f"Found unsupported operation type for reduction assignments: {op}."
+                )
+        return operator(op1, op2)
+    else:
+        match op:
+            case ReductionOp.Min:
+                return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
+            case ReductionOp.Max:
+                return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
+            case _:
+                raise FreezeError(
+                    f"Found unsupported operation type for reduction assignments: {op}."
+                )
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index f1ba8715431d96fb2a09a01e45872def421fe94f..03b4edccfdd402d3013e0e82b924f5e5e87458c7 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,5 +1,6 @@
 import itertools
 from copy import copy
+
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union

 import sympy as sp
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index fe108a3ff9ae12139e6204f6cd2cc174b4c5211a..bd0fa1fe9d5459f842ec4aca52bf13de63ab7834 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType
 from .pointers import mem_acc
+from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp

 from .math import (
     prod,
@@ -27,12 +28,15 @@ from .math import (
     count_operations_in_ast,
     common_denominator,
     get_symmetric_part,
-    SymbolCreator
+    SymbolCreator,
 )

 __all__ = [
     "ConditionalFieldAccess",
+    "reduction_assignment",
+    "reduction_assignment_from_str",
+    "ReductionOp",
     "TypedSymbol",
     "CastFunc",
     "tcast",
@@ -63,5 +67,5 @@ __all__ = [
     "common_denominator",
     "get_symmetric_part",
     "SymbolCreator",
-    "DynamicType"
+    "DynamicType",
 ]
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..e95e37c24f3e5815ebaebb8f2739da15619a70c5
--- /dev/null
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -0,0 +1,82 @@
+from enum import Enum
+
+from sympy.codegen.ast import AssignmentBase
+
+
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Div = "/"
+    Min = "min"
+    Max = "max"
+
+
+class ReductionAssignment(AssignmentBase):
+    """
+    Base class for reduction assignments.
+
+    Attributes:
+    ===========
+
+    reduction_op : ReductionOp
+        Enum for the binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
+ """ + + _reduction_op = None # type: ReductionOp + + @property + def reduction_op(self): + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op): + self._reduction_op = op + + +class AddReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Add + + +class SubReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Sub + + +class MulReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Mul + + +class MinReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Min + + +class MaxReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Max + + +# Mapping from ReductionOp enum to ReductionAssigment classes +_reduction_assignment_classes = { + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + ] +} + +# Mapping from ReductionOp str to ReductionAssigment classes +_reduction_assignment_classes_for_str = { + cls.value: cls for cls in _reduction_assignment_classes +} + + +def reduction_assignment(lhs, op: ReductionOp, rhs): + if op not in _reduction_assignment_classes: + raise ValueError("Unrecognized operator %s" % op) + return _reduction_assignment_classes[op](lhs, rhs) + + +def reduction_assignment_from_str(lhs, op: str, rhs): + return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..cd1710cf5ffe8e5ced135867be2c9e947c7e6b64 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np + +import pystencils as ps +from pystencils.sympyextensions import reduction_assignment_from_str + +INIT_W = 5 +INIT_ARR = 2 +SIZE = 15 +SOLUTION = { + "+": INIT_W + INIT_ARR * SIZE, + "-": INIT_W - INIT_ARR * SIZE, + "*": INIT_W * INIT_ARR**SIZE, + "min": min(INIT_W, INIT_ARR), + "max": max(INIT_W, INIT_ARR), +} + + +# get AST for kernel with reduction assignment +def get_reduction_assign_ast(dtype, op, config): + x = ps.fields(f"x: {dtype}[1d]") + w = ps.TypedSymbol("w", dtype) + + red_assign = reduction_assignment_from_str(w, op, x.center()) + + return ps.create_kernel([red_assign], config, default_dtype=dtype) + + +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction_cpu(instruction_set, dtype, op): + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } + + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) + + +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: 
+):
+    try:
+        import cupy as cp
+        from cupy_backends.cuda.api.runtime import CUDARuntimeError
+
+        device_count = cp.cuda.runtime.getDeviceCount()
+        print(f"Found {device_count} GPUs")
+    except ImportError:
+        pytest.skip(reason="CuPy is not available", allow_module_level=True)
+    except CUDARuntimeError:
+        pytest.skip(
+            reason="No CUDA capable device is detected", allow_module_level=True
+        )
+
+    cfg = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
+    ps.show_code(ast_reduction)
+    kernel_reduction = ast_reduction.compile()
+
+    if use_block_fitting:
+        kernel_reduction.launch_config.fit_block_size((32, 1, 1))
+
+    array = np.full((SIZE,), INIT_ARR, dtype=dtype)
+    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+
+    array_gpu = cp.asarray(array)
+    reduction_array_gpu = cp.asarray(reduction_array)
+
+    kernel_reduction(x=array_gpu, w=reduction_array_gpu)
+    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d652c6e1f53f5a7b060f2a725162ef07f1fdfae
--- /dev/null
+++ b/util/generate_simd_horizontal_op.py
@@ -0,0 +1,309 @@
+from enum import Enum
+
+FCT_QUALIFIERS = "inline"
+
+
+class InstructionSets(Enum):
+    SSE3 = "SSE3"
+    AVX = "AVX"
+    AVX512 = "AVX512"
+    NEON = "NEON"
+
+    def __str__(self):
+        return self.value
+
+
+class ReductionOps(Enum):
+    Add = ("add", "+")
+    Mul = ("mul", "*")
+    Min = ("min", "min")
+    Max = ("max", "max")
+
+    def __init__(self, op_name, op_str):
+        self.op_name = op_name
+        self.op_str = op_str
+
+
+class ScalarTypes(Enum):
+    Double = "double"
+    Float = "float"
+
+    def __str__(self):
+        return self.value
+
+
+class VectorTypes(Enum):
+    SSE3_128d = "__m128d"
+    SSE3_128 = "__m128"
+
+    AVX_256d = "__m256d"
+    AVX_256 = "__m256"
+    AVX_128 = "__m128"
+
+    AVX_512d = "__m512d"
+    AVX_512 = "__m512"
+
+    NEON_64x2 = "float64x2_t"
+    NEON_32x4 = "float32x4_t"
+
+    def __str__(self):
+        return self.value
+
+
+class Variable:
+    def __init__(self, name: str, dtype: ScalarTypes | VectorTypes):
+        self._name = name
+        self._dtype = dtype
+
+    def __str__(self):
+        return f"{self._dtype} {self._name}"
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def dtype(self) -> ScalarTypes | VectorTypes:
+        return self._dtype
+
+
+def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets:
+    match vtype:
+        case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d:
+            return InstructionSets.SSE3
+        case VectorTypes.AVX_256 | VectorTypes.AVX_256d:
+            return InstructionSets.AVX
+        case VectorTypes.AVX_512 | VectorTypes.AVX_512d:
+            return InstructionSets.AVX512
+        case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2:
+            return InstructionSets.NEON
+
+
+def intrin_prefix(instruction_set: InstructionSets, double_prec: bool):
+    match instruction_set:
+        case InstructionSets.SSE3:
+            return "_mm"
+        case InstructionSets.AVX:
+            return "_mm256"
+        case InstructionSets.AVX512:
+            return "_mm512"
+        case InstructionSets.NEON:
+            return "vgetq" if double_prec else "vget"
+        case _:
+            raise ValueError(f"Unknown instruction set {instruction_set}")
+
+
+def intrin_suffix(instruction_set: InstructionSets, double_prec: bool):
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        return "pd" if double_prec else "ps"
+    elif instruction_set in [InstructionSets.NEON]:
return "f64" if double_prec else "f32" + else: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str): + return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})" + + +def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset): + return f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})" + + +def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str): + return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})" + + +def generate_cvts_intrin(double_prec: bool, v: str): + convert_suffix = "f64" if double_prec else "f32" + intrin_suffix = "d" if double_prec else "s" + return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})" + + +def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps): + prefix = intrin_prefix(instruction_set, double_prec) + suffix = intrin_suffix(instruction_set, double_prec) + return f"{prefix}_horizontal_{op.op_name}_{suffix}" + + +def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable): + double_prec = svar.dtype is ScalarTypes.Double + return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n" + + +# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling +def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable): + reduction_op = ReductionOps.Add + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + sname = scalar_var.name + vtype = vector_var.dtype + vname = vector_var.name + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var) + cvts = lambda var: generate_cvts_intrin(double_prec, var) + + # function body + body = f"\t{vtype} _v = {vname};\n" + match instruction_set: + case InstructionSets.SSE3: + if double_prec: + body += f"\treturn {sname} + {cvts(hadd('_v'))};\n" + else: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_h', '_mm_movehdup_ps(_h)'))};\n" + + case InstructionSets.AVX: + if double_prec: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_mm256_extractf128_pd(_h,1)', '_mm256_castpd256_pd128(_h)'))};\n" + else: + add_i = "_mm_hadd_ps(_i,_i)" + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\t__m128 _i = {simd_op('_mm256_extractf128_ps(_h,1)', '_mm256_castps256_ps128(_h)')};\n" \ + f"\treturn {sname} + {cvts(add_i)};\n" + + case _: + raise ValueError(f"No specialized version of horizontal_add available for {instruction_set}") + + # function decl + decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var) + + return decl + body + "}\n" + + +def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable): + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + # generate specialized version for add operation + if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]: + return generate_simd_horizontal_add(scalar_var, 
+
+    sname = scalar_var.name
+    stype = scalar_var.dtype
+    vtype = vector_var.dtype
+    vname = vector_var.name
+
+    opname = reduction_op.op_name
+    opstr = reduction_op.op_str
+
+    reduction_function = f"f{opname}" \
+        if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None
+
+    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
+    cvts = lambda var: generate_cvts_intrin(double_prec, var)
+    shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset)
+
+    # function body
+    body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else ""
+    match instruction_set:
+        case InstructionSets.SSE3:
+            if double_prec:
+                body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n"
+            else:
+                body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX:
+            if double_prec:
+                body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))};\n"
+            else:
+                body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \
+                        f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX512:
+            suffix = intrin_suffix(instruction_set, double_prec)
+            body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n"
+
+        case InstructionSets.NEON:
+            if double_prec:
+                body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n"
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n"
+            else:
+                body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \
+                        f"\t{stype} _r = vget_lane_f32(_w,0);\n"
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n"
+
+        case _:
+            raise ValueError(f"Unsupported instruction set {instruction_set}")
+
+    # finalize reduction
+    if reduction_function:
+        body += f"\treturn {reduction_function}(_r, {sname});\n"
+    else:
+        body += f"\treturn {sname} {opstr} _r;\n"
+
+    # function decl
+    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)
+
+    return decl + body + "}\n"
+
+
+stypes = {
+    True: ScalarTypes.Double,
+    False: ScalarTypes.Float
+}
+
+vtypes_for_instruction_set = {
+    InstructionSets.SSE3: {
+        True: VectorTypes.SSE3_128d,
+        False: VectorTypes.SSE3_128
+    },
+    InstructionSets.AVX: {
+        True: VectorTypes.AVX_256d,
+        False: VectorTypes.AVX_256
+    },
+    InstructionSets.AVX512: {
+        True: VectorTypes.AVX_512d,
+        False: VectorTypes.AVX_512
+    },
+    InstructionSets.NEON: {
+        True: VectorTypes.NEON_64x2,
+        False: VectorTypes.NEON_32x4
+    },
+}
+
+guards_for_instruction_sets = {
+    InstructionSets.SSE3: "__SSE3__",
+    InstructionSets.AVX: "__AVX__",
+    InstructionSets.AVX512: '__AVX512F__',
+    InstructionSets.NEON: '_M_ARM64',
+}
+
+code = """#pragma once
+
+#include <cmath>
+
+"""
+
+for instruction_set in InstructionSets:
+    code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n"
+
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        code += "#include <immintrin.h>\n\n"
+    elif instruction_set == InstructionSets.NEON:
+        code += "#include <arm_neon.h>\n\n"
+    else:
+        raise ValueError(f"Missing header include for instruction set {instruction_set}")
ValueError(f"Missing header include for instruction set {instruction_set}") + + for reduction_op in ReductionOps: + for double_prec in [True, False]: + scalar_var = Variable("dst", stypes[double_prec]) + vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec]) + + code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n" + + code += "#endif\n\n" + +print(code)