From 8a2bf6648548c18ab47d71db429956e2bf1f52b6 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 14 Jan 2025 18:13:18 +0100 Subject: [PATCH 001/180] Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> --- .../backend/kernelcreation/freeze.py | 27 +++++++++ src/pystencils/simp/assignment_collection.py | 13 +++++ src/pystencils/sympyextensions/__init__.py | 2 + src/pystencils/sympyextensions/reduction.py | 57 +++++++++++++++++++ tests/kernelcreation/test_reduction.py | 44 ++++++++++++++ 5 files changed, 143 insertions(+) create mode 100644 src/pystencils/sympyextensions/reduction.py create mode 100644 tests/kernelcreation/test_reduction.py diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 44ee17077..65be23065 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,3 +1,4 @@ +from sympyextensions.reduction import ReducedAssignment from typing import overload, cast, Any from functools import reduce from operator import add, mul, sub, truediv @@ -183,6 +184,32 @@ class FreezeExpressions: return PsAssignment(lhs, op(lhs.clone(), rhs)) + def map_ReducedAssignment(self, expr: ReducedAssignment): + lhs = self.visit(expr.lhs) + rhs = self.visit(expr.rhs) + + assert isinstance(lhs, PsExpression) + assert isinstance(rhs, PsExpression) + + match expr.op: + case "+=": + op = add + case "-=": + op = sub + case "*=": + op = mul + case "/=": + op = truediv + # TODO: unsure if sp.Min & sp.Max work here + case "min=": + op = sp.Min + case "max=": + op = sp.Max + case _: + raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + + return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment? 
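On the SymPy-facing side, the reduced assignment classes added below in reduction.py are meant to be constructed through the reduced_assign factory; a minimal usage sketch, mirroring the test added in this patch (field and symbol names are taken from that test, the import path is assumed from the new module's location):

    import sympy as sp
    import pystencils as ps
    from pystencils.sympyextensions.reduction import reduced_assign  # path assumed from the new file

    x = ps.fields("x: float64[1d]")   # field whose cell values are accumulated
    w = sp.Symbol("w")                # scalar reduction target

    # "+" selects AddReducedAssignment; "-", "*", "min" and "max" work analogously
    assignment = reduced_assign(w, "+", x.center())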
+ def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) return PsSymbolExpr(symb) diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index f1ba87154..4de3e8dc6 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,5 +1,8 @@ import itertools from copy import copy + +from sympyextensions import reduced_assign +from sympyextensions.reduction import ReducedAssignment from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp @@ -55,8 +58,13 @@ class AssignmentCollection: subexpressions = list(itertools.chain.from_iterable( [(a if isinstance(a, Iterable) else [a]) for a in subexpressions])) + # filter out reduced assignments + reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)] + main_assignments = [a for a in main_assignments if (a not in reduced_assignments)] + self.main_assignments = main_assignments self.subexpressions = subexpressions + self.reductions = reduced_assignments if simplification_hints is None: simplification_hints = {} @@ -71,6 +79,11 @@ class AssignmentCollection: else: self.subexpression_symbol_generator = subexpression_symbol_generator + def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None: + """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" + assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists" + self.reductions.append(reduced_assign(lhs, op, rhs)) + def add_simplification_hint(self, key: str, value: Any) -> None: """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" assert key not in self.simplification_hints, "This hint already exists" diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 7431416c9..6ab24e936 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc +from .reduction import reduced_assign from .math import ( prod, @@ -33,6 +34,7 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", + "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py new file mode 100644 index 000000000..aa947c1d2 --- /dev/null +++ b/src/pystencils/sympyextensions/reduction.py @@ -0,0 +1,57 @@ +from sympy.codegen.ast import AssignmentBase + + +class ReducedAssignment(AssignmentBase): + """ + Base class for reduced assignments. + + Attributes: + =========== + + binop : str + Symbol for binary operation being applied in the assignment, such as "+", + "*", etc. 
+ """ + binop = None # type: str + + # TODO: initial value + + @property + def op(self): + return self.binop + '=' + + +class AddReducedAssignment(ReducedAssignment): + binop = '+' + +class SubReducedAssignment(ReducedAssignment): + binop = '-' + + +class MulReducedAssignment(ReducedAssignment): + binop = '*' + + +class DivReducedAssignment(ReducedAssignment): + binop = '/' + + +class MinReducedssignment(ReducedAssignment): + binop = 'min' + +class MaxReducedssignment(ReducedAssignment): + binop = 'max' + + +# Mapping from binary op strings to AugmentedAssignment subclasses +reduced_assign_classes = { + cls.binop: cls for cls in [ + AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment, + MinReducedssignment, MaxReducedssignment + ] +} + +def reduced_assign(lhs, op, rhs): + if op not in reduced_assign_classes: + raise ValueError("Unrecognized operator %s" % op) + return reduced_assign_classes[op](lhs, rhs) \ No newline at end of file diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 000000000..47509e267 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,44 @@ +import pytest +import numpy as np +import sympy as sp + +import pystencils as ps +from sympyextensions.reduction import reduced_assign + + +@pytest.mark.parametrize('dtype', ["float64", "float32"]) +def test_log(dtype): + a = sp.Symbol("a") + x = ps.fields(f'x: {dtype}[1d]') + + # kernel with main assignments and no reduction + + main_assignment = ps.AssignmentCollection({x.center(): a}) + + ast_main = ps.create_kernel(main_assignment, default_dtype=dtype) + code_main = ps.get_code_str(ast_main) + kernel_main = ast_main.compile() + + # ps.show_code(ast) + + if dtype == "float64": + assert "float" not in code_main + + array = np.zeros((10,), dtype=dtype) + kernel_main(x=array, a=100) + assert np.allclose(array, 4.60517019) + + # kernel with single reduction assignment + + omega = sp.Symbol("omega") + + reduction_assignment = reduced_assign(omega, "+", x.center()) + + ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) + code_reduction = ps.get_code_str(ast_reduction) + kernel_reduction = ast_reduction.compile() + + if dtype == "float64": + assert "float" not in code_reduction + + ps.show_code(ast_reduction) \ No newline at end of file -- GitLab From 543bf118944b32b851b526964ee275d7a1808034 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 11:36:53 +0100 Subject: [PATCH 002/180] Fix relative module imports for newly introduced sympyextensions for reductions --- src/pystencils/backend/kernelcreation/freeze.py | 2 +- src/pystencils/simp/assignment_collection.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 65be23065..4d75f1ca6 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,4 +1,3 @@ -from sympyextensions.reduction import ReducedAssignment from typing import overload, cast, Any from functools import reduce from operator import add, mul, sub, truediv @@ -16,6 +15,7 @@ from ...sympyextensions import ( ) from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReducedAssignment from ...field import Field, FieldType from .context import 
KernelCreationContext diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index 4de3e8dc6..212dbf751 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,8 +1,6 @@ import itertools from copy import copy -from sympyextensions import reduced_assign -from sympyextensions.reduction import ReducedAssignment from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp @@ -11,6 +9,8 @@ import pystencils from ..assignment import Assignment from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs) from ..sympyextensions import count_operations, fast_subs +from ..sympyextensions import reduced_assign +from ..sympyextensions.reduction import ReducedAssignment class AssignmentCollection: -- GitLab From 558a0f20e082370a0bccd20b96a647e3536bc31e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 12:59:36 +0100 Subject: [PATCH 003/180] Expose new reduced assignments to pystencils interface --- src/pystencils/__init__.py | 14 ++++++++++++++ tests/kernelcreation/test_reduction.py | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 6cb375b61..eecd929cf 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -38,6 +38,14 @@ from .simp import AssignmentCollection from .sympyextensions.typed_sympy import TypedSymbol, DynamicType from .sympyextensions import SymbolCreator from .datahandling import create_data_handling +from .sympyextensions.reduction import ( + AddReducedAssignment, + SubReducedAssignment, + MulReducedAssignment, + DivReducedAssignment, + MinReducedssignment, + MaxReducedssignment +) __all__ = [ "Field", @@ -69,6 +77,12 @@ __all__ = [ "AssignmentCollection", "Assignment", "AddAugmentedAssignment", + "AddReducedAssignment", + "SubReducedAssignment", + "MulReducedAssignment", + "DivReducedAssignment", + "MinReducedssignment", + "MaxReducedssignment", "assignment_from_stencil", "SymbolCreator", "create_data_handling", diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 47509e267..f8c2b1870 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -3,7 +3,7 @@ import numpy as np import sympy as sp import pystencils as ps -from sympyextensions.reduction import reduced_assign +from pystencils import AddReducedAssignment @pytest.mark.parametrize('dtype', ["float64", "float32"]) @@ -32,7 +32,7 @@ def test_log(dtype): omega = sp.Symbol("omega") - reduction_assignment = reduced_assign(omega, "+", x.center()) + reduction_assignment = AddReducedAssignment(omega, x.center()) ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) code_reduction = ps.get_code_str(ast_reduction) -- GitLab From ba1458538a9c954803d26337f7b428f599421f2c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:36:18 +0100 Subject: [PATCH 004/180] Get rid of reduction using the division operator --- src/pystencils/__init__.py | 2 -- src/pystencils/sympyextensions/reduction.py | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index eecd929cf..916a61392 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -42,7 +42,6 @@ from .sympyextensions.reduction 
import ( AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, - DivReducedAssignment, MinReducedssignment, MaxReducedssignment ) @@ -80,7 +79,6 @@ __all__ = [ "AddReducedAssignment", "SubReducedAssignment", "MulReducedAssignment", - "DivReducedAssignment", "MinReducedssignment", "MaxReducedssignment", "assignment_from_stencil", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index aa947c1d2..90ab61ede 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -32,10 +32,6 @@ class MulReducedAssignment(ReducedAssignment): binop = '*' -class DivReducedAssignment(ReducedAssignment): - binop = '/' - - class MinReducedssignment(ReducedAssignment): binop = 'min' @@ -46,7 +42,7 @@ class MaxReducedssignment(ReducedAssignment): # Mapping from binary op strings to AugmentedAssignment subclasses reduced_assign_classes = { cls.binop: cls for cls in [ - AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment, + AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, MinReducedssignment, MaxReducedssignment ] } -- GitLab From 778cfd51b4df56c7fccf3686b0c0d3273b43a202 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:39:39 +0100 Subject: [PATCH 005/180] Add functions for numeric limits (to be supported by the backends) --- src/pystencils/backend/functions.py | 10 ++++++++++ src/pystencils/backend/platforms/generic_cpu.py | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 388160f30..ea0d6cb9d 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -94,6 +94,16 @@ class MathFunctions(Enum): self.num_args = num_args +class NumericLimitsFunctions(MathFunctions): + """Numerical limits functions supported by the backend. + + Each platform has to materialize these functions to a concrete implementation. 
+ """ + + min = ("min", 0) + max = ("max", 0) + + class PsMathFunction(PsFunction): """Homogenously typed mathematical functions.""" diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 94fbfa0e1..7cb378703 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -43,7 +43,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>"} + return {"<math.h>", "<climits.h"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -62,6 +62,8 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args + # TODO: numeric limits + if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction match func: -- GitLab From 719a76fba40197320d03bab06e1d139e6d24a724 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:42:00 +0100 Subject: [PATCH 006/180] Introduce reduction symbol property and add to lhs of reduced symbol --- .../backend/kernelcreation/context.py | 2 ++ .../backend/kernelcreation/freeze.py | 28 ++++++++++++------- src/pystencils/codegen/properties.py | 10 +++++++ src/pystencils/sympyextensions/reduction.py | 6 ++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 39fb8ef6d..4b4604a21 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -75,6 +75,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) + # TODO: add list of reduction symbols + self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 4d75f1ca6..0d1ce72e1 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -65,6 +65,9 @@ from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions from ..exceptions import FreezeError +import backend.functions +from codegen.properties import ReductionSymbolProperty + ExprLike = ( sp.Expr @@ -188,27 +191,32 @@ class FreezeExpressions: lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) - assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) + assert isinstance(lhs, PsSymbolExpr) match expr.op: - case "+=": + case "+": op = add - case "-=": + init_val = PsConstant(0) + case "-": op = sub - case "*=": + init_val = PsConstant(0) + case "*": op = mul - case "/=": - op = truediv - # TODO: unsure if sp.Min & sp.Max work here - case "min=": + init_val = PsConstant(1) + # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards + case "min": op = sp.Min - case "max=": + init_val = backend.functions.NumericLimitsFunctions("min") + case "max": op = sp.Max + init_val = backend.functions.NumericLimitsFunctions("max") case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment? 
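The init_val attached to a reduction is intended to be the neutral element of the chosen operation, i.e. a start value that cannot influence the final result. In plain Python terms the idea is the following (illustrative only, not code from the patch):

    values = [3.0, 5.0, 7.0]

    acc = 0.0                      # neutral element of "+": adding it changes nothing
    for v in values:
        acc = acc + v              # shape of the frozen "+" reduction
    assert acc == sum(values)

    acc = float("inf")             # a "min" reduction starts from the largest representable value
    for v in values:
        acc = min(acc, v)
    assert acc == min(values)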
+ lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val)) + + return PsAssignment(lhs, op(lhs.clone(), rhs)) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index d377fb3d3..5578d2408 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from ..field import Field +from backend.ast.expressions import PsExpression + @dataclass(frozen=True) class PsSymbolProperty: @@ -14,6 +16,14 @@ class UniqueSymbolProperty(PsSymbolProperty): """Base class for unique properties, of which only one instance may be registered at a time.""" +@dataclass(frozen=True) +class ReductionSymbolProperty(UniqueSymbolProperty): + """Symbol acts as a base pointer to a field.""" + + op: str + init_val: PsExpression + + @dataclass(frozen=True) class FieldShape(PsSymbolProperty): """Symbol acts as a shape parameter to a field.""" diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 90ab61ede..e2760cc6c 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -12,13 +12,11 @@ class ReducedAssignment(AssignmentBase): Symbol for binary operation being applied in the assignment, such as "+", "*", etc. """ - binop = None # type: str - - # TODO: initial value + binop = None # type: str @property def op(self): - return self.binop + '=' + return self.binop class AddReducedAssignment(ReducedAssignment): -- GitLab From 66ce43954c585e5d576f895ab0d60f62196db813 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:19:09 +0100 Subject: [PATCH 007/180] Minor import fixes --- src/pystencils/backend/kernelcreation/freeze.py | 9 ++++----- src/pystencils/codegen/properties.py | 2 +- src/pystencils/sympyextensions/__init__.py | 2 -- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 0d1ce72e1..7316e2f9f 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -62,11 +62,10 @@ from ..ast.vector import PsVecMemAcc from ..constants import PsConstant from ...types import PsNumericType, PsStructType, PsType from ..exceptions import PsInputError -from ..functions import PsMathFunction, MathFunctions +from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError -import backend.functions -from codegen.properties import ReductionSymbolProperty +from ...codegen.properties import ReductionSymbolProperty ExprLike = ( @@ -207,10 +206,10 @@ class FreezeExpressions: # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards case "min": op = sp.Min - init_val = backend.functions.NumericLimitsFunctions("min") + init_val = NumericLimitsFunctions("min") case "max": op = sp.Max - init_val = backend.functions.NumericLimitsFunctions("max") + init_val = NumericLimitsFunctions("max") case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 5578d2408..cc4ff4101 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from ..field import Field -from 
backend.ast.expressions import PsExpression +from ..backend.ast.expressions import PsExpression @dataclass(frozen=True) diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 6ab24e936..7431416c9 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,7 +1,6 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc -from .reduction import reduced_assign from .math import ( prod, @@ -34,7 +33,6 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", - "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", -- GitLab From 53fc7ca4c0ad2e601a09050b3273781ded65542a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:50:51 +0100 Subject: [PATCH 008/180] Add dictionary of reduced symbols to codegen context --- .../backend/kernelcreation/context.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 4b4604a21..b9df6f682 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -75,7 +75,7 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - # TODO: add list of reduction symbols + self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -170,6 +170,21 @@ class KernelCreationContext: self._symbols[old.name] = new + def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty): + """Adds a reduction property to a symbol. + + The symbol ``symbol`` should not have a reduction property and must exist in the symbol table. 
+ """ + if self.find_symbol(symbol.name) is None: + raise PsInternalCompilerError( + "add_reduction_to_symbol: Symbol does not exist in the symbol table" + ) + + if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): + self._symbols_with_reduction[symbol] = reduction + else: + raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property") + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: -- GitLab From b8718cb1d67b14b39b9d806ff3131179fa97e24e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:51:22 +0100 Subject: [PATCH 009/180] Try fixing circular module import --- src/pystencils/backend/kernelcreation/context.py | 2 ++ src/pystencils/codegen/properties.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index b9df6f682..686646815 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -9,6 +9,8 @@ from ...defaults import DEFAULTS from ...field import Field, FieldType from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType +from ...codegen.properties import ReductionSymbolProperty + from ..memory import PsSymbol, PsBuffer from ..constants import PsConstant from ...types import ( diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index cc4ff4101..2b0af986a 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,9 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field - -from ..backend.ast.expressions import PsExpression - +from typing import Any @dataclass(frozen=True) class PsSymbolProperty: @@ -21,7 +19,7 @@ class ReductionSymbolProperty(UniqueSymbolProperty): """Symbol acts as a base pointer to a field.""" op: str - init_val: PsExpression + init_val: Any # TODO: type? 
@dataclass(frozen=True) -- GitLab From af855492661c5649d3286eb2153f369b3813fb88 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 12:59:16 +0100 Subject: [PATCH 010/180] Minor adaptation on how symbols are given reduction property --- src/pystencils/backend/kernelcreation/context.py | 5 +++-- src/pystencils/backend/kernelcreation/freeze.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 686646815..bcb3a53f8 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -179,13 +179,14 @@ class KernelCreationContext: """ if self.find_symbol(symbol.name) is None: raise PsInternalCompilerError( - "add_reduction_to_symbol: Symbol does not exist in the symbol table" + f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table" ) if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): + symbol.add_property(reduction) self._symbols_with_reduction[symbol] = reduction else: - raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property") + raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property") def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 7316e2f9f..ae728dd49 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -213,7 +213,7 @@ class FreezeExpressions: case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val)) + self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) return PsAssignment(lhs, op(lhs.clone(), rhs)) -- GitLab From 4ae330dc8e87ac8cbb83c6f402ab99cdcb9e9edb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 13:33:12 +0100 Subject: [PATCH 011/180] Add C function selection for numeric limits functions --- src/pystencils/backend/platforms/generic_cpu.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 7cb378703..ea7799a14 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -3,8 +3,8 @@ from typing import Sequence from pystencils.backend.ast.expressions import PsCall -from ..functions import CFunction, PsMathFunction, MathFunctions -from ...types import PsIntegerType, PsIeeeFloatType +from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType from .platform import Platform from ..exceptions import MaterializationError @@ -62,7 +62,10 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - # TODO: numeric limits + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) + call.function = cfunc + return call if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: 
CFunction -- GitLab From a16969bf5c054b93c5e8a1a69a291d8437bbaa35 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 15:52:05 +0100 Subject: [PATCH 012/180] Add omp reduction clauses for reduced symbols --- src/pystencils/backend/kernelcreation/context.py | 5 +++++ src/pystencils/backend/transformations/add_pragmas.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index bcb3a53f8..f3ee646a5 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -219,6 +219,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]: + """Return a dictionary holding symbols and their reduction property.""" + return self._symbols_with_reduction + # Fields and Arrays @property diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 78e721f38..6d72e1550 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -10,6 +10,8 @@ from ..ast import PsAstNode from ..ast.structural import PsBlock, PsLoop, PsPragma from ..ast.expressions import PsExpression +from ...types import PsScalarType + if TYPE_CHECKING: from ...codegen.config import OpenMpConfig @@ -110,6 +112,13 @@ class AddOpenMP: pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" + if bool(ctx.symbols_with_reduction): + for symbol, reduction in ctx.symbols_with_reduction.items(): + if isinstance(symbol.dtype, PsScalarType): + pragma_text += f" reduction({reduction.op}: {symbol.name})" + else: + raise NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") + if omp_params.num_threads is not None: pragma_text += f" num_threads({str(omp_params.num_threads)})" -- GitLab From 555a6a836408071ca32c705bdf0fdf5e5e610437 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:15:24 +0100 Subject: [PATCH 013/180] Reformat reduction.py --- src/pystencils/sympyextensions/reduction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index e2760cc6c..c9e5bfdfb 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -12,7 +12,7 @@ class ReducedAssignment(AssignmentBase): Symbol for binary operation being applied in the assignment, such as "+", "*", etc.
""" - binop = None # type: str + binop = None # type: str @property def op(self): @@ -22,6 +22,7 @@ class ReducedAssignment(AssignmentBase): class AddReducedAssignment(ReducedAssignment): binop = '+' + class SubReducedAssignment(ReducedAssignment): binop = '-' @@ -33,6 +34,7 @@ class MulReducedAssignment(ReducedAssignment): class MinReducedssignment(ReducedAssignment): binop = 'min' + class MaxReducedssignment(ReducedAssignment): binop = 'max' @@ -45,7 +47,8 @@ reduced_assign_classes = { ] } + def reduced_assign(lhs, op, rhs): if op not in reduced_assign_classes: raise ValueError("Unrecognized operator %s" % op) - return reduced_assign_classes[op](lhs, rhs) \ No newline at end of file + return reduced_assign_classes[op](lhs, rhs) -- GitLab From ef9239ede7d00457d90aab5a1740894dfc47d21a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:18:37 +0100 Subject: [PATCH 014/180] Add back reduced_assign to sympyextensions interface --- src/pystencils/sympyextensions/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 7431416c9..6ab24e936 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc +from .reduction import reduced_assign from .math import ( prod, @@ -33,6 +34,7 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", + "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", -- GitLab From cf2ec0662b8ea423c252c71e4e926e7fc388d4da Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:19:23 +0100 Subject: [PATCH 015/180] Fix inheritance of special math function enum classes --- src/pystencils/backend/functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index ea0d6cb9d..736345395 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -94,7 +94,7 @@ class MathFunctions(Enum): self.num_args = num_args -class NumericLimitsFunctions(MathFunctions): +class NumericLimitsFunctions(Enum): """Numerical limits functions supported by the backend. Each platform has to materialize these functions to a concrete implementation. 
@@ -109,12 +109,12 @@ class PsMathFunction(PsFunction): __match_args__ = ("func",) - def __init__(self, func: MathFunctions) -> None: + def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None: super().__init__(func.function_name, func.num_args) self._func = func @property - def func(self) -> MathFunctions: + def func(self) -> MathFunctions | NumericLimitsFunctions: return self._func def __str__(self) -> str: -- GitLab From 9741c024245137405c2c7b09db63267d88d6c12b Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:20:14 +0100 Subject: [PATCH 016/180] Fix header include of limits.h --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index ea7799a14..e1a34564d 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -43,7 +43,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>", "<climits.h"} + return {"<math.h>", "<limits.h>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace -- GitLab From 9a8e6f9bb9a14a144f80b678147c1a4c36456741 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 17:30:25 +0100 Subject: [PATCH 017/180] Omit distinction between normal and reduced assignments in AssignmentCollection --- src/pystencils/simp/assignment_collection.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index 212dbf751..03b4edccf 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -9,8 +9,6 @@ import pystencils from ..assignment import Assignment from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs) from ..sympyextensions import count_operations, fast_subs -from ..sympyextensions import reduced_assign -from ..sympyextensions.reduction import ReducedAssignment class AssignmentCollection: @@ -58,13 +56,8 @@ class AssignmentCollection: subexpressions = list(itertools.chain.from_iterable( [(a if isinstance(a, Iterable) else [a]) for a in subexpressions])) - # filter out reduced assignments - reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)] - main_assignments = [a for a in main_assignments if (a not in reduced_assignments)] - self.main_assignments = main_assignments self.subexpressions = subexpressions - self.reductions = reduced_assignments if simplification_hints is None: simplification_hints = {} @@ -79,11 +72,6 @@ class AssignmentCollection: else: self.subexpression_symbol_generator = subexpression_symbol_generator - def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None: - """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" - assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists" - self.reductions.append(reduced_assign(lhs, op, rhs)) - def add_simplification_hint(self, key: str, value: Any) -> None: """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" assert key not in self.simplification_hints, "This hint already exists" -- GitLab From e9ee769d2b8d2f177611d6ca5c2ccc0394d83874 Mon Sep 17 00:00:00 2001 From: zy69guqi 
<richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 17:38:46 +0100 Subject: [PATCH 018/180] Adaptations to reduction test --- tests/kernelcreation/test_reduction.py | 40 ++++++++------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index f8c2b1870..0532b30f5 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -6,39 +6,23 @@ import pystencils as ps from pystencils import AddReducedAssignment -@pytest.mark.parametrize('dtype', ["float64", "float32"]) -def test_log(dtype): - a = sp.Symbol("a") +@pytest.mark.parametrize('dtype', ["float64"]) +def test_reduction(dtype): x = ps.fields(f'x: {dtype}[1d]') + w = sp.Symbol("w") - # kernel with main assignments and no reduction + # kernel with reduction assignment - main_assignment = ps.AssignmentCollection({x.center(): a}) + reduction_assignment = AddReducedAssignment(w, x.center()) - ast_main = ps.create_kernel(main_assignment, default_dtype=dtype) - code_main = ps.get_code_str(ast_main) - kernel_main = ast_main.compile() + config = ps.CreateKernelConfig(cpu_openmp=True) - # ps.show_code(ast) - - if dtype == "float64": - assert "float" not in code_main - - array = np.zeros((10,), dtype=dtype) - kernel_main(x=array, a=100) - assert np.allclose(array, 4.60517019) - - # kernel with single reduction assignment - - omega = sp.Symbol("omega") - - reduction_assignment = AddReducedAssignment(omega, x.center()) - - ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) - code_reduction = ps.get_code_str(ast_reduction) + ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype) + #code_reduction = ps.get_code_str(ast_reduction) kernel_reduction = ast_reduction.compile() - if dtype == "float64": - assert "float" not in code_reduction + ps.show_code(ast_reduction) - ps.show_code(ast_reduction) \ No newline at end of file + array = np.ones((10,), dtype=dtype) + kernel_reduction(x=array, w=0) + # TODO: check if "w = #points" \ No newline at end of file -- GitLab From f16d8e7978174c24f682202b65aa64e1d53003bb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 18:15:58 +0100 Subject: [PATCH 019/180] Rename min/max of numeric limits enum --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index e1a34564d..27df6aee4 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -62,7 +62,7 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max): cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call -- GitLab From 1a1c23b57015ace0c821d0a709d2bf17fa67b42a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 18:38:34 +0100 Subject: [PATCH 020/180] Adapt comment of ReductionSymbolProperty --- src/pystencils/codegen/properties.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/properties.py 
b/src/pystencils/codegen/properties.py index 2b0af986a..0bad4e898 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from ..field import Field from typing import Any + @dataclass(frozen=True) class PsSymbolProperty: """Base class for symbol properties, which can be used to add additional information to symbols""" @@ -16,10 +17,10 @@ class UniqueSymbolProperty(PsSymbolProperty): @dataclass(frozen=True) class ReductionSymbolProperty(UniqueSymbolProperty): - """Symbol acts as a base pointer to a field.""" + """Property for symbols specifying the operation and initial value for a reduction.""" op: str - init_val: Any # TODO: type? + init_val: Any # TODO: type? @dataclass(frozen=True) -- GitLab From fff5a079171ad884069cd307f2777a2dc0aa68f6 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 13:43:32 +0100 Subject: [PATCH 021/180] Fix removal of function parameters for lhs symbols that are not declared in the kernel --- src/pystencils/backend/ast/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py index edeba04f2..7032690a0 100644 --- a/src/pystencils/backend/ast/analysis.py +++ b/src/pystencils/backend/ast/analysis.py @@ -62,7 +62,7 @@ class UndefinedSymbolsCollector: case PsAssignment(lhs, rhs): undefined_vars = self(lhs) | self(rhs) - if isinstance(lhs, PsSymbolExpr): + if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr): undefined_vars.remove(lhs.symbol) return undefined_vars -- GitLab From bb984679607d2f3625849667f06c348245b1813a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:29:10 +0100 Subject: [PATCH 022/180] Fix usage of numerical limits for init value of reduction --- src/pystencils/backend/functions.py | 8 ++++++-- src/pystencils/backend/kernelcreation/freeze.py | 4 ++-- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 736345395..18c2277cf 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -100,8 +100,12 @@ class NumericLimitsFunctions(Enum): Each platform has to materialize these functions to a concrete implementation. 
""" - min = ("min", 0) - max = ("max", 0) + Min = ("min", 0) + Max = ("max", 0) + + def __init__(self, func_name, num_args): + self.function_name = func_name + self.num_args = num_args class PsMathFunction(PsFunction): diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index ae728dd49..9a34303e2 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -206,10 +206,10 @@ class FreezeExpressions: # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards case "min": op = sp.Min - init_val = NumericLimitsFunctions("min") + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case "max": op = sp.Max - init_val = NumericLimitsFunctions("max") + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 27df6aee4..e1a34564d 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -62,7 +62,7 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max): + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call -- GitLab From a3025645e30265621d41315e7a0449768225c361 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:55:42 +0100 Subject: [PATCH 023/180] Fix min/max reductions --- src/pystencils/backend/kernelcreation/freeze.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 9a34303e2..64230203f 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -193,29 +193,31 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) + new_rhs: PsExpression match expr.op: case "+": - op = add init_val = PsConstant(0) + new_rhs = add(lhs.clone(), rhs) case "-": - op = sub init_val = PsConstant(0) + new_rhs = sub(lhs.clone(), rhs) case "*": - op = mul init_val = PsConstant(1) - # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards + new_rhs = mul(lhs.clone(), rhs) case "min": - op = sp.Min init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs]) case "max": - op = sp.Max init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + # set reduction symbol property in context self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) - return PsAssignment(lhs, op(lhs.clone(), rhs)) + return PsAssignment(lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) -- GitLab From 
9bbb8181dcd0717dd61de853f33e11c1ca19d806 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:56:12 +0100 Subject: [PATCH 024/180] Parameterize test_reduction.py for different reduction operations --- tests/kernelcreation/test_reduction.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 0532b30f5..c41d250f4 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -3,17 +3,18 @@ import numpy as np import sympy as sp import pystencils as ps -from pystencils import AddReducedAssignment +from pystencils.sympyextensions import reduced_assign @pytest.mark.parametrize('dtype', ["float64"]) -def test_reduction(dtype): +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction(dtype, op): x = ps.fields(f'x: {dtype}[1d]') w = sp.Symbol("w") # kernel with reduction assignment - reduction_assignment = AddReducedAssignment(w, x.center()) + reduction_assignment = reduced_assign(w, op, x.center()) config = ps.CreateKernelConfig(cpu_openmp=True) -- GitLab From 3c5a93b4a0016a2c12e0f210fd1324816e6df15e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 15:02:19 +0100 Subject: [PATCH 025/180] Define type of init_val for reduction as Any --- src/pystencils/backend/kernelcreation/freeze.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 64230203f..840329013 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -195,6 +195,7 @@ class FreezeExpressions: # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression + init_val: Any # TODO: type? 
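For the parameterized test above, the results to expect on the all-ones input of length 10 can be cross-checked with plain NumPy, assuming the accumulator really starts from the neutral element of each operation (reference values only, not part of the patch):

    import numpy as np

    array = np.ones((10,), dtype="float64")
    expected = {
        "+": np.sum(array),     # 10.0
        "-": -np.sum(array),    # -10.0 when the accumulator starts at 0 and every element is subtracted
        "*": np.prod(array),    # 1.0
        "min": np.min(array),   # 1.0
        "max": np.max(array),   # 1.0
    }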
match expr.op: case "+": init_val = PsConstant(0) -- GitLab From 75ea862f50d5372126d1a934b4b1e15ba3dc8c85 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 15:06:57 +0100 Subject: [PATCH 026/180] Try fix mypy no-redef error --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index e1a34564d..3deb03329 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -63,12 +63,12 @@ class GenericCpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + cfunc: CFunction cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): - cfunc: CFunction match func: case ( MathFunctions.Exp -- GitLab From 90ca9ead0199cd4f5988e6c43e9c9c5350f566b6 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 20 Jan 2025 17:46:49 +0100 Subject: [PATCH 027/180] Try initializing kernel-local reduction variable copy --- .../backend/kernelcreation/freeze.py | 28 +++++++++++-------- src/pystencils/codegen/driver.py | 12 +++++++- src/pystencils/codegen/properties.py | 7 +++-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 840329013..e0dcba8fd 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -7,6 +7,7 @@ import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment +from ..memory import PsSymbol from ...assignment import Assignment from ...simp import AssignmentCollection from ...sympyextensions import ( @@ -193,32 +194,37 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + # create kernel-local copy of lhs symbol to work with + new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype) + new_lhs = PsSymbolExpr(new_lhs_symbol) + self._ctx.add_symbol(new_lhs_symbol) + # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression - init_val: Any # TODO: type? 
+ init_val: PsExpression match expr.op: case "+": - init_val = PsConstant(0) - new_rhs = add(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(0)) + new_rhs = add(new_lhs.clone(), rhs) case "-": - init_val = PsConstant(0) - new_rhs = sub(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(0)) + new_rhs = sub(new_lhs.clone(), rhs) case "*": - init_val = PsConstant(1) - new_rhs = mul(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(1)) + new_rhs = mul(new_lhs.clone(), rhs) case "min": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) - new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs]) + new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) - new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs]) + new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") # set reduction symbol property in context - self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) + self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol)) - return PsAssignment(lhs, new_rhs) + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 7bdec96cc..199860743 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,12 +7,13 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter +from ..backend.ast.expressions import PsSymbolExpr from ..types import create_numeric_type, PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop +from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -151,6 +152,14 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Init local reduction variable copy + # for red, prop in self._ctx.symbols_with_reduction.items(): + # kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + + # Write back result to reduction target variable + # for red, prop in self._ctx.symbols_with_reduction.items(): + # kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + # Target-Specific optimizations if self._cfg.target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -449,6 +458,7 @@ def _get_function_params( props: set[PsSymbolProperty] = set() for prop in symb.properties: match prop: + # TODO: how to export reduction result (via pointer)? 
case FieldShape() | FieldStride(): props.add(prop) case BufferBasePtr(buf): diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 0bad4e898..4b8e7f2bf 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,7 +2,6 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field -from typing import Any @dataclass(frozen=True) @@ -19,8 +18,12 @@ class UniqueSymbolProperty(PsSymbolProperty): class ReductionSymbolProperty(UniqueSymbolProperty): """Property for symbols specifying the operation and initial value for a reduction.""" + from ..backend.memory import PsSymbol + from ..backend.ast.expressions import PsExpression + op: str - init_val: Any # TODO: type? + init_val: PsExpression + orig_symbol: PsSymbol @dataclass(frozen=True) -- GitLab From 3fc9a049683b0cbac6bfa721efd38db05c201236 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 13:55:35 +0100 Subject: [PATCH 028/180] Swap out neutral init values for reduced assignments with min/max op --- src/pystencils/backend/kernelcreation/freeze.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 840329013..b58813fcd 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -207,10 +207,10 @@ class FreezeExpressions: init_val = PsConstant(1) new_rhs = mul(lhs.clone(), rhs) case "min": - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs]) case "max": - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") -- GitLab From 9fd1c2ad9bb4f8c225627306f0369273092ef737 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 17:34:06 +0100 Subject: [PATCH 029/180] Fix declaration of local reduction var and write back to original variable --- src/pystencils/codegen/driver.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 199860743..4b08b84ef 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -13,7 +13,7 @@ from ..types import create_numeric_type, PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment +from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -153,12 +153,16 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Init local reduction variable copy - # for red, prop in self._ctx.symbols_with_reduction.items(): - # kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + for red, prop in self._ctx.symbols_with_reduction.items(): + kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), 
prop.init_val)] + kernel_ast.statements # Write back result to reduction target variable - # for red, prop in self._ctx.symbols_with_reduction.items(): - # kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + for red, prop in self._ctx.symbols_with_reduction.items(): + kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + + # TODO: can this be omitted? + typify = Typifier(self._ctx) + kernel_ast = typify(kernel_ast) # Target-Specific optimizations if self._cfg.target.is_cpu(): -- GitLab From 6bc3cf3f17ed3395dabc15a2207739de245cd038 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 18:42:43 +0100 Subject: [PATCH 030/180] Set type of reduced variable to pointer and write back via PsMemAcc --- src/pystencils/backend/kernelcreation/freeze.py | 15 ++++++++++----- src/pystencils/codegen/driver.py | 10 ++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 06d98a44e..d8fb1b91e 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -61,7 +61,7 @@ from ..ast.expressions import ( from ..ast.vector import PsVecMemAcc from ..constants import PsConstant -from ...types import PsNumericType, PsStructType, PsType +from ...types import PsNumericType, PsStructType, PsType, PsPointerType from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError @@ -195,9 +195,9 @@ class FreezeExpressions: assert isinstance(lhs, PsSymbolExpr) # create kernel-local copy of lhs symbol to work with - new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype) - new_lhs = PsSymbolExpr(new_lhs_symbol) - self._ctx.add_symbol(new_lhs_symbol) + new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype) + new_lhs = PsSymbolExpr(new_lhs_symb) + self._ctx.add_symbol(new_lhs_symb) # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression @@ -221,8 +221,13 @@ class FreezeExpressions: case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + # replace original symbol with pointer-based type used for export + orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype)) + self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr) + # set reduction symbol property in context - self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol)) + init_val.dtype = rhs.dtype + self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr)) return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 4b08b84ef..04d7376d0 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter -from ..backend.ast.expressions import PsSymbolExpr +from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr from ..types import create_numeric_type, PsIntegerType, PsScalarType @@ -158,11 +158,9 @@ class 
DefaultKernelCreationDriver: # Write back result to reduction target variable for red, prop in self._ctx.symbols_with_reduction.items(): - kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] - - # TODO: can this be omitted? - typify = Typifier(self._ctx) - kernel_ast = typify(kernel_ast) + kernel_ast.statements += [PsAssignment( + PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), + PsSymbolExpr(red))] # Target-Specific optimizations if self._cfg.target.is_cpu(): -- GitLab From b5dd2ef085b19e505d0331d8aa8f6dae9ee85eb0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 15:16:32 +0100 Subject: [PATCH 031/180] Split reduction var property into local and pointer-based reduction var properties --- .../backend/kernelcreation/context.py | 54 ++++++++++++++----- .../backend/kernelcreation/freeze.py | 31 ++++++----- .../backend/transformations/add_pragmas.py | 4 +- src/pystencils/codegen/driver.py | 10 ++-- src/pystencils/codegen/properties.py | 16 ++++-- 5 files changed, 78 insertions(+), 37 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index f3ee646a5..5e5ca117d 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -9,7 +9,7 @@ from ...defaults import DEFAULTS from ...field import Field, FieldType from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType -from ...codegen.properties import ReductionSymbolProperty +from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable from ..memory import PsSymbol, PsBuffer from ..constants import PsConstant @@ -77,7 +77,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict() + self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict() + self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -172,21 +173,41 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty): - """Adds a reduction property to a symbol. + def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable): + """Adds entry for a symbol and its property to the lookup table for local reduction variables. - The symbol ``symbol`` should not have a reduction property and must exist in the symbol table. + The symbol ``symbol`` should not have a 'LocalReductionSymbol' property and shall not exist in the symbol table. 
""" - if self.find_symbol(symbol.name) is None: + if self.find_symbol(local_symb.name) is not None: raise PsInternalCompilerError( - f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table" + f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table" ) + self.add_symbol(local_symb) - if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): - symbol.add_property(reduction) - self._symbols_with_reduction[symbol] = reduction + if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable): + local_symb.add_property(local_var_prop) + self._local_reduction_symbols[local_symb] = local_var_prop else: - raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property") + raise PsInternalCompilerError( + f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table" + ) + + def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable): + """Replaces reduction symbol with a pointer-based counterpart used for export + and adds the new symbol and its property to the lookup table for pointer-based reduction variables + + The symbol ``ptr_symbol`` should not exist in the symbol table. + """ + self.replace_symbol(orig_symb, ptr_symb) + + if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties( + ReductionPointerVariable): + ptr_symb.add_property(ptr_var_prop) + self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop + else: + raise PsInternalCompilerError( + f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table " + ) def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None @@ -220,9 +241,14 @@ class KernelCreationContext: return self._symbols.values() @property - def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]: - """Return a dictionary holding symbols and their reduction property.""" - return self._symbols_with_reduction + def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]: + """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" + return self._local_reduction_symbols + + @property + def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]: + """Return a dictionary holding pointer-based reduction symbols and their reduction properties.""" + return self._reduction_ptr_symbols # Fields and Arrays diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index d8fb1b91e..1e9984def 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -66,7 +66,7 @@ from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError -from ...codegen.properties import ReductionSymbolProperty +from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable ExprLike = ( @@ -194,40 +194,45 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + orig_lhs_symb = lhs.symbol + dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? 
+ + # replace original symbol with pointer-based type used for export + orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) + # create kernel-local copy of lhs symbol to work with - new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype) + new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) new_lhs = PsSymbolExpr(new_lhs_symb) - self._ctx.add_symbol(new_lhs_symb) # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression init_val: PsExpression match expr.op: case "+": - init_val = PsConstantExpr(PsConstant(0)) + init_val = PsConstantExpr(PsConstant(0, dtype)) new_rhs = add(new_lhs.clone(), rhs) case "-": - init_val = PsConstantExpr(PsConstant(0)) + init_val = PsConstantExpr(PsConstant(0, dtype)) new_rhs = sub(new_lhs.clone(), rhs) case "*": - init_val = PsConstantExpr(PsConstant(1)) + init_val = PsConstantExpr(PsConstant(1, dtype)) new_rhs = mul(new_lhs.clone(), rhs) case "min": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + init_val.dtype = dtype new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + init_val.dtype = dtype new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - # replace original symbol with pointer-based type used for export - orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype)) - self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr) - - # set reduction symbol property in context - init_val.dtype = rhs.dtype - self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr)) + # set reduction symbol properties (local/pointer variables) in context + self._ctx.add_local_reduction_symbol(new_lhs_symb, + LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr)) + self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr, + ReductionPointerVariable(expr.op, new_lhs_symb)) return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 6d72e1550..44d1d1ede 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -112,8 +112,8 @@ class AddOpenMP: pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" - if bool(ctx.symbols_with_reduction): - for symbol, reduction in ctx.symbols_with_reduction.items(): + if bool(ctx.local_reduction_symbols): + for symbol, reduction in ctx.local_reduction_symbols.items(): if isinstance(symbol.dtype, PsScalarType): pragma_text += f" reduction({reduction.op}: {symbol.name})" else: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 04d7376d0..3fe2fe74e 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -153,14 +153,14 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Init local reduction variable copy - for red, prop in self._ctx.symbols_with_reduction.items(): - kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + for local_red, prop in self._ctx.local_reduction_symbols.items(): + kernel_ast.statements = 
[PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements # Write back result to reduction target variable - for red, prop in self._ctx.symbols_with_reduction.items(): + for red_ptr, prop in self._ctx.reduction_pointer_symbols.items(): kernel_ast.statements += [PsAssignment( - PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), - PsSymbolExpr(red))] + PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), + PsSymbolExpr(prop.local_symbol))] # Target-Specific optimizations if self._cfg.target.is_cpu(): diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 4b8e7f2bf..1e71c5b98 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -15,15 +15,25 @@ class UniqueSymbolProperty(PsSymbolProperty): @dataclass(frozen=True) -class ReductionSymbolProperty(UniqueSymbolProperty): - """Property for symbols specifying the operation and initial value for a reduction.""" +class LocalReductionVariable(PsSymbolProperty): + """Property for symbols specifying the operation and initial value for a kernel-local reduction variable.""" from ..backend.memory import PsSymbol from ..backend.ast.expressions import PsExpression op: str init_val: PsExpression - orig_symbol: PsSymbol + ptr_symbol: PsSymbol + + +@dataclass(frozen=True) +class ReductionPointerVariable(PsSymbolProperty): + """Property for pointer-type symbols exporting the reduction result from the kernel.""" + + from ..backend.memory import PsSymbol + + op: str + local_symbol: PsSymbol @dataclass(frozen=True) -- GitLab From 350a4eac9394ca7bb95231b5ad14a1526952e23c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:01:00 +0100 Subject: [PATCH 032/180] Propagate properties of reduction pointer symbols to kernel parameters --- src/pystencils/codegen/driver.py | 5 +++-- src/pystencils/jit/cpu_extension_module.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 3fe2fe74e..dd71e30be 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, replace from .target import Target from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange -from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr +from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr @@ -460,7 +460,8 @@ def _get_function_params( props: set[PsSymbolProperty] = set() for prop in symb.properties: match prop: - # TODO: how to export reduction result (via pointer)? 
+ case ReductionPointerVariable(): + props.add(prop) case FieldShape() | FieldStride(): props.add(prop) case BufferBasePtr(buf): diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index befb033e6..c2c969eaa 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -13,7 +13,7 @@ from ..codegen import ( Kernel, Parameter, ) -from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride +from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable from ..types import ( PsType, PsUnsignedIntegerType, @@ -265,7 +265,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return self._array_buffers[field] def extract_scalar(self, param: Parameter) -> str: - if param not in self._scalar_extractions: + if any(isinstance(e, ReductionPointerVariable) for e in param.properties): + # TODO: implement + pass + elif param not in self._scalar_extractions: extract_func = self._scalar_extractor(param.dtype) code = self.TMPL_EXTRACT_SCALAR.format( name=param.name, -- GitLab From 6c8ee44f148f13b456c0c9c754e434cc9cf5b59a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:01:51 +0100 Subject: [PATCH 033/180] Use literals for C macros used for the numeric limits --- .../backend/platforms/generic_cpu.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 3deb03329..40c338315 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,6 +4,7 @@ from typing import Sequence from pystencils.backend.ast.expressions import PsCall from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions +from ..literals import PsLiteral from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType from .platform import Platform @@ -25,7 +26,7 @@ from ..ast.expressions import ( PsLookup, PsGe, PsLe, - PsTernary, + PsTernary, PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -43,7 +44,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>", "<limits.h>"} + return {"<math.h>", "<limits.h>", "<float.h>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -63,12 +64,25 @@ class GenericCpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - cfunc: CFunction - cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) - call.function = cfunc - return call + # get type prefix for macro + # TODO: there must be a better way... 
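# Example: for float64, NumericLimitsFunctions.Max lowers here to the literal DBL_MAX
# (FLT_MAX for float32); these macros are provided by the <float.h> header added to
# required_headers above.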
+ tpe = "" + match dtype: + case PsIeeeFloatType(): + match dtype.width: + case 32: + tpe = "FLT" + case 64: + tpe = "DBL" + case _: + raise MaterializationError( + f"No implementation available for function {func} on data type {dtype}" + ) + + return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype)) if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): + cfunc: CFunction match func: case ( MathFunctions.Exp -- GitLab From 4e748308aca64b02cfa689fb0356a9a38d8c35af Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:30:43 +0100 Subject: [PATCH 034/180] Integrate reduction pointers to parameters.py --- src/pystencils/codegen/parameters.py | 16 ++++++++++++++-- src/pystencils/jit/cpu_extension_module.py | 17 ++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py index d8411266e..094553517 100644 --- a/src/pystencils/codegen/parameters.py +++ b/src/pystencils/codegen/parameters.py @@ -1,14 +1,14 @@ from __future__ import annotations from warnings import warn -from typing import Sequence, Iterable +from typing import Sequence, Iterable, Optional from .properties import ( PsSymbolProperty, _FieldProperty, FieldShape, FieldStride, - FieldBasePtr, + FieldBasePtr, ReductionPointerVariable, ) from ..types import PsType from ..field import Field @@ -39,6 +39,9 @@ class Parameter: key=lambda f: f.name, ) ) + self._reduction_ptr: Optional[ReductionPointerVariable] = next( + (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None + ) @property def name(self): @@ -79,6 +82,11 @@ class Parameter: """Set of fields associated with this parameter.""" return self._fields + @property + def reduction_pointer(self) -> Optional[ReductionPointerVariable]: + """Reduction pointer associated with this parameter.""" + return self._reduction_ptr + def get_properties( self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...] 
) -> set[PsSymbolProperty]: @@ -105,6 +113,10 @@ class Parameter: ) return bool(self.get_properties(FieldBasePtr)) + @property + def is_reduction_pointer(self) -> bool: + return bool(self._reduction_ptr) + @property def is_field_stride(self) -> bool: # pragma: no cover warn( diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index c2c969eaa..f9c04200c 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -206,6 +206,8 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() + self._reduction_ptrs: dict[Parameter, str] = dict() + self._constraint_checks: list[str] = [] self._call: str | None = None @@ -265,10 +267,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return self._array_buffers[field] def extract_scalar(self, param: Parameter) -> str: - if any(isinstance(e, ReductionPointerVariable) for e in param.properties): - # TODO: implement - pass - elif param not in self._scalar_extractions: + if param not in self._scalar_extractions: extract_func = self._scalar_extractor(param.dtype) code = self.TMPL_EXTRACT_SCALAR.format( name=param.name, @@ -279,6 +278,12 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_reduction_ptr(self, param: Parameter) -> str: + if param not in self._reduction_ptrs: + # TODO: implement + pass + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -306,7 +311,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name def extract_parameter(self, param: Parameter): - if param.is_field_parameter: + if param.is_reduction_pointer: + self.extract_reduction_ptr(param) + elif param.is_field_parameter: self.extract_array_assoc_var(param) else: self.extract_scalar(param) -- GitLab From 4f6f5580bc5cb302e1a064a9d642e66822155db6 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 17:23:02 +0100 Subject: [PATCH 035/180] Rewire existing code extraction of fields to support reduction pointer extraction --- src/pystencils/jit/cpu_extension_module.py | 53 ++++++++++++---------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index f9c04200c..d8d90c924 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -199,9 +199,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ """ def __init__(self) -> None: - self._array_buffers: dict[Field, str] = dict() - self._array_extractions: dict[Field, str] = dict() - self._array_frees: dict[Field, str] = dict() + self._array_buffers: dict[Any, str] = dict() + self._array_extractions: dict[Any, str] = dict() + self._array_frees: dict[Any, str] = dict() self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() @@ -235,36 +235,37 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ else: return None - def extract_field(self, field: Field) -> str: + def extract_buffer(self, buffer: Any, name: str, dtype: PsType) -> str: """Adds an array, and returns the name of the underlying Py_Buffer.""" - if field not in self._array_extractions: - extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name) + if buffer not in self._array_extractions: + extraction_code = 
self.TMPL_EXTRACT_ARRAY.format(name=name) # Check array type - type_char = self._type_char(field.dtype) + type_char = self._type_char(dtype) if type_char is not None: - dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'" + dtype_cond = f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, - expected=str(field.dtype), + name=name, + expected=str(dtype), ) # Check item size - itemsize = field.dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" - extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize - ) + itemsize = dtype.itemsize + if itemsize is not None: # itemsize of pointer not known (TODO?) + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" + extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( + cond=item_size_cond, what="itemsize", name=name, expected=itemsize + ) - self._array_buffers[field] = f"buffer_{field.name}" - self._array_extractions[field] = extraction_code + self._array_buffers[buffer] = f"buffer_{name}" + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + self._array_frees[buffer] = release_code - return self._array_buffers[field] + return self._array_buffers[buffer] def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -280,14 +281,20 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ def extract_reduction_ptr(self, param: Parameter) -> str: if param not in self._reduction_ptrs: - # TODO: implement - pass + ptr = param.reduction_pointer + buffer = self.extract_buffer(ptr, param.name, param.dtype) + code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + + assert code is not None + + self._array_assoc_var_extractions[param] = code + return param.name def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] - buffer = self.extract_field(field) + buffer = self.extract_buffer(field, field.name, field.dtype) code: str | None = None for prop in param.properties: -- GitLab From 72fa86729d84028e8b900e8a9cd0f9a8cdfab401 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 17:35:50 +0100 Subject: [PATCH 036/180] Refine test_reduction.py to check for result correctness --- tests/kernelcreation/test_reduction.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c41d250f4..b97343e72 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -5,6 +5,15 @@ import sympy as sp import pystencils as ps from pystencils.sympyextensions import reduced_assign +INIT=2 +SIZE=15 +SOLUTION = { + "+": INIT * SIZE, + "-": INIT * -SIZE, + "*": INIT**SIZE, + "min": INIT, + "max": INIT +} @pytest.mark.parametrize('dtype', ["float64"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) @@ -24,6 +33,7 @@ def test_reduction(dtype, op): ps.show_code(ast_reduction) - array = np.ones((10,), dtype=dtype) - kernel_reduction(x=array, w=0) - # TODO: check if "w = #points" \ No newline at end of file + array = np.full((SIZE,), INIT, dtype=dtype) + reduction_array = np.zeros(1, dtype=dtype) + 
kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) \ No newline at end of file -- GitLab From 6b8bff09ef5b1c3f451aacae6aa730a9a822f5b2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 14 Jan 2025 18:13:18 +0100 Subject: [PATCH 037/180] Initial work for introducing reduction capabilities to pystencils Signed-off-by: zy69guqi <richard.angersbach@fau.de> --- .../backend/kernelcreation/freeze.py | 27 +++++++++ src/pystencils/simp/assignment_collection.py | 13 +++++ src/pystencils/sympyextensions/__init__.py | 2 + src/pystencils/sympyextensions/reduction.py | 57 +++++++++++++++++++ tests/kernelcreation/test_reduction.py | 44 ++++++++++++++ 5 files changed, 143 insertions(+) create mode 100644 src/pystencils/sympyextensions/reduction.py create mode 100644 tests/kernelcreation/test_reduction.py diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 44ee17077..65be23065 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,3 +1,4 @@ +from sympyextensions.reduction import ReducedAssignment from typing import overload, cast, Any from functools import reduce from operator import add, mul, sub, truediv @@ -183,6 +184,32 @@ class FreezeExpressions: return PsAssignment(lhs, op(lhs.clone(), rhs)) + def map_ReducedAssignment(self, expr: ReducedAssignment): + lhs = self.visit(expr.lhs) + rhs = self.visit(expr.rhs) + + assert isinstance(lhs, PsExpression) + assert isinstance(rhs, PsExpression) + + match expr.op: + case "+=": + op = add + case "-=": + op = sub + case "*=": + op = mul + case "/=": + op = truediv + # TODO: unsure if sp.Min & sp.Max work here + case "min=": + op = sp.Min + case "max=": + op = sp.Max + case _: + raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + + return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment? 
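# Illustrative sketch: freezing  reduced_assign(w, "+", x.center())  with this mapping
# yields roughly the assignment  w := w + x[...],  i.e. the reduction operator applied
# to the current value of w and the frozen right-hand side.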
+ def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) return PsSymbolExpr(symb) diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index f1ba87154..4de3e8dc6 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,5 +1,8 @@ import itertools from copy import copy + +from sympyextensions import reduced_assign +from sympyextensions.reduction import ReducedAssignment from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp @@ -55,8 +58,13 @@ class AssignmentCollection: subexpressions = list(itertools.chain.from_iterable( [(a if isinstance(a, Iterable) else [a]) for a in subexpressions])) + # filter out reduced assignments + reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)] + main_assignments = [a for a in main_assignments if (a not in reduced_assignments)] + self.main_assignments = main_assignments self.subexpressions = subexpressions + self.reductions = reduced_assignments if simplification_hints is None: simplification_hints = {} @@ -71,6 +79,11 @@ class AssignmentCollection: else: self.subexpression_symbol_generator = subexpression_symbol_generator + def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None: + """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" + assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists" + self.reductions.append(reduced_assign(lhs, op, rhs)) + def add_simplification_hint(self, key: str, value: Any) -> None: """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" assert key not in self.simplification_hints, "This hint already exists" diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 7431416c9..6ab24e936 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc +from .reduction import reduced_assign from .math import ( prod, @@ -33,6 +34,7 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", + "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py new file mode 100644 index 000000000..aa947c1d2 --- /dev/null +++ b/src/pystencils/sympyextensions/reduction.py @@ -0,0 +1,57 @@ +from sympy.codegen.ast import AssignmentBase + + +class ReducedAssignment(AssignmentBase): + """ + Base class for reduced assignments. + + Attributes: + =========== + + binop : str + Symbol for binary operation being applied in the assignment, such as "+", + "*", etc. 
+ """ + binop = None # type: str + + # TODO: initial value + + @property + def op(self): + return self.binop + '=' + + +class AddReducedAssignment(ReducedAssignment): + binop = '+' + +class SubReducedAssignment(ReducedAssignment): + binop = '-' + + +class MulReducedAssignment(ReducedAssignment): + binop = '*' + + +class DivReducedAssignment(ReducedAssignment): + binop = '/' + + +class MinReducedssignment(ReducedAssignment): + binop = 'min' + +class MaxReducedssignment(ReducedAssignment): + binop = 'max' + + +# Mapping from binary op strings to AugmentedAssignment subclasses +reduced_assign_classes = { + cls.binop: cls for cls in [ + AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment, + MinReducedssignment, MaxReducedssignment + ] +} + +def reduced_assign(lhs, op, rhs): + if op not in reduced_assign_classes: + raise ValueError("Unrecognized operator %s" % op) + return reduced_assign_classes[op](lhs, rhs) \ No newline at end of file diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 000000000..47509e267 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,44 @@ +import pytest +import numpy as np +import sympy as sp + +import pystencils as ps +from sympyextensions.reduction import reduced_assign + + +@pytest.mark.parametrize('dtype', ["float64", "float32"]) +def test_log(dtype): + a = sp.Symbol("a") + x = ps.fields(f'x: {dtype}[1d]') + + # kernel with main assignments and no reduction + + main_assignment = ps.AssignmentCollection({x.center(): a}) + + ast_main = ps.create_kernel(main_assignment, default_dtype=dtype) + code_main = ps.get_code_str(ast_main) + kernel_main = ast_main.compile() + + # ps.show_code(ast) + + if dtype == "float64": + assert "float" not in code_main + + array = np.zeros((10,), dtype=dtype) + kernel_main(x=array, a=100) + assert np.allclose(array, 4.60517019) + + # kernel with single reduction assignment + + omega = sp.Symbol("omega") + + reduction_assignment = reduced_assign(omega, "+", x.center()) + + ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) + code_reduction = ps.get_code_str(ast_reduction) + kernel_reduction = ast_reduction.compile() + + if dtype == "float64": + assert "float" not in code_reduction + + ps.show_code(ast_reduction) \ No newline at end of file -- GitLab From f54fa321869682e71edd3f2828725104de9abe36 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 11:36:53 +0100 Subject: [PATCH 038/180] Fix relative module imports for newly introduced sympyextensions for reductions --- src/pystencils/backend/kernelcreation/freeze.py | 2 +- src/pystencils/simp/assignment_collection.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 65be23065..4d75f1ca6 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,4 +1,3 @@ -from sympyextensions.reduction import ReducedAssignment from typing import overload, cast, Any from functools import reduce from operator import add, mul, sub, truediv @@ -16,6 +15,7 @@ from ...sympyextensions import ( ) from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReducedAssignment from ...field import Field, FieldType from .context import 
KernelCreationContext diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index 4de3e8dc6..212dbf751 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -1,8 +1,6 @@ import itertools from copy import copy -from sympyextensions import reduced_assign -from sympyextensions.reduction import ReducedAssignment from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union import sympy as sp @@ -11,6 +9,8 @@ import pystencils from ..assignment import Assignment from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs) from ..sympyextensions import count_operations, fast_subs +from ..sympyextensions import reduced_assign +from ..sympyextensions.reduction import ReducedAssignment class AssignmentCollection: -- GitLab From c7b9bb522828f8d5e32d3c20ad7d17eee689d2eb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 12:59:36 +0100 Subject: [PATCH 039/180] Expose new reduced assignments to pystencils interface --- src/pystencils/__init__.py | 14 ++++++++++++++ tests/kernelcreation/test_reduction.py | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 6cb375b61..eecd929cf 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -38,6 +38,14 @@ from .simp import AssignmentCollection from .sympyextensions.typed_sympy import TypedSymbol, DynamicType from .sympyextensions import SymbolCreator from .datahandling import create_data_handling +from .sympyextensions.reduction import ( + AddReducedAssignment, + SubReducedAssignment, + MulReducedAssignment, + DivReducedAssignment, + MinReducedssignment, + MaxReducedssignment +) __all__ = [ "Field", @@ -69,6 +77,12 @@ __all__ = [ "AssignmentCollection", "Assignment", "AddAugmentedAssignment", + "AddReducedAssignment", + "SubReducedAssignment", + "MulReducedAssignment", + "DivReducedAssignment", + "MinReducedssignment", + "MaxReducedssignment", "assignment_from_stencil", "SymbolCreator", "create_data_handling", diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 47509e267..f8c2b1870 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -3,7 +3,7 @@ import numpy as np import sympy as sp import pystencils as ps -from sympyextensions.reduction import reduced_assign +from pystencils import AddReducedAssignment @pytest.mark.parametrize('dtype', ["float64", "float32"]) @@ -32,7 +32,7 @@ def test_log(dtype): omega = sp.Symbol("omega") - reduction_assignment = reduced_assign(omega, "+", x.center()) + reduction_assignment = AddReducedAssignment(omega, x.center()) ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) code_reduction = ps.get_code_str(ast_reduction) -- GitLab From fae371d48773f9423855be824f495f3f20abcdc2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:36:18 +0100 Subject: [PATCH 040/180] Get rid of reduction using the division operator --- src/pystencils/__init__.py | 2 -- src/pystencils/sympyextensions/reduction.py | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index eecd929cf..916a61392 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -42,7 +42,6 @@ from .sympyextensions.reduction 
import ( AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, - DivReducedAssignment, MinReducedssignment, MaxReducedssignment ) @@ -80,7 +79,6 @@ __all__ = [ "AddReducedAssignment", "SubReducedAssignment", "MulReducedAssignment", - "DivReducedAssignment", "MinReducedssignment", "MaxReducedssignment", "assignment_from_stencil", diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index aa947c1d2..90ab61ede 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -32,10 +32,6 @@ class MulReducedAssignment(ReducedAssignment): binop = '*' -class DivReducedAssignment(ReducedAssignment): - binop = '/' - - class MinReducedssignment(ReducedAssignment): binop = 'min' @@ -46,7 +42,7 @@ class MaxReducedssignment(ReducedAssignment): # Mapping from binary op strings to AugmentedAssignment subclasses reduced_assign_classes = { cls.binop: cls for cls in [ - AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment, + AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, MinReducedssignment, MaxReducedssignment ] } -- GitLab From b263d752d8af83569517aa56ef17facfa26adc91 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:39:39 +0100 Subject: [PATCH 041/180] Add functions for numeric limits (to be supported by the backends) --- src/pystencils/backend/functions.py | 10 ++++++++++ src/pystencils/backend/platforms/generic_cpu.py | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 388160f30..ea0d6cb9d 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -94,6 +94,16 @@ class MathFunctions(Enum): self.num_args = num_args +class NumericLimitsFunctions(MathFunctions): + """Numerical limits functions supported by the backend. + + Each platform has to materialize these functions to a concrete implementation. 
+ """ + + min = ("min", 0) + max = ("max", 0) + + class PsMathFunction(PsFunction): """Homogenously typed mathematical functions.""" diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index b6d7dd551..affeb34d4 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -43,7 +43,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>"} + return {"<math.h>", "<climits.h"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -62,6 +62,8 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args + # TODO: numeric limits + if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction match func: -- GitLab From 548375295a08f88bef9ba8e597c47fe7143d8139 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 16:42:00 +0100 Subject: [PATCH 042/180] Introduce reduction symbol property and add to lhs of reduced symbol --- .../backend/kernelcreation/context.py | 2 ++ .../backend/kernelcreation/freeze.py | 28 ++++++++++++------- src/pystencils/codegen/properties.py | 10 +++++++ src/pystencils/sympyextensions/reduction.py | 6 ++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 8f5931c64..b6bf09dba 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -75,6 +75,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) + # TODO: add list of reduction symbols + self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 4d75f1ca6..0d1ce72e1 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -65,6 +65,9 @@ from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions from ..exceptions import FreezeError +import backend.functions +from codegen.properties import ReductionSymbolProperty + ExprLike = ( sp.Expr @@ -188,27 +191,32 @@ class FreezeExpressions: lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) - assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) + assert isinstance(lhs, PsSymbolExpr) match expr.op: - case "+=": + case "+": op = add - case "-=": + init_val = PsConstant(0) + case "-": op = sub - case "*=": + init_val = PsConstant(0) + case "*": op = mul - case "/=": - op = truediv - # TODO: unsure if sp.Min & sp.Max work here - case "min=": + init_val = PsConstant(1) + # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards + case "min": op = sp.Min - case "max=": + init_val = backend.functions.NumericLimitsFunctions("min") + case "max": op = sp.Max + init_val = backend.functions.NumericLimitsFunctions("max") case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment? 
+ lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val)) + + return PsAssignment(lhs, op(lhs.clone(), rhs)) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index d377fb3d3..5578d2408 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from ..field import Field +from backend.ast.expressions import PsExpression + @dataclass(frozen=True) class PsSymbolProperty: @@ -14,6 +16,14 @@ class UniqueSymbolProperty(PsSymbolProperty): """Base class for unique properties, of which only one instance may be registered at a time.""" +@dataclass(frozen=True) +class ReductionSymbolProperty(UniqueSymbolProperty): + """Symbol acts as a base pointer to a field.""" + + op: str + init_val: PsExpression + + @dataclass(frozen=True) class FieldShape(PsSymbolProperty): """Symbol acts as a shape parameter to a field.""" diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 90ab61ede..e2760cc6c 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -12,13 +12,11 @@ class ReducedAssignment(AssignmentBase): Symbol for binary operation being applied in the assignment, such as "+", "*", etc. """ - binop = None # type: str - - # TODO: initial value + binop = None # type: str @property def op(self): - return self.binop + '=' + return self.binop class AddReducedAssignment(ReducedAssignment): -- GitLab From 35c8160bf5f9da900255583bbfb10f935d5b3687 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:19:09 +0100 Subject: [PATCH 043/180] Minor import fixes --- src/pystencils/backend/kernelcreation/freeze.py | 9 ++++----- src/pystencils/codegen/properties.py | 2 +- src/pystencils/sympyextensions/__init__.py | 2 -- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 0d1ce72e1..7316e2f9f 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -62,11 +62,10 @@ from ..ast.vector import PsVecMemAcc from ..constants import PsConstant from ...types import PsNumericType, PsStructType, PsType from ..exceptions import PsInputError -from ..functions import PsMathFunction, MathFunctions +from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError -import backend.functions -from codegen.properties import ReductionSymbolProperty +from ...codegen.properties import ReductionSymbolProperty ExprLike = ( @@ -207,10 +206,10 @@ class FreezeExpressions: # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards case "min": op = sp.Min - init_val = backend.functions.NumericLimitsFunctions("min") + init_val = NumericLimitsFunctions("min") case "max": op = sp.Max - init_val = backend.functions.NumericLimitsFunctions("max") + init_val = NumericLimitsFunctions("max") case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 5578d2408..cc4ff4101 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from ..field import Field -from 
backend.ast.expressions import PsExpression +from ..backend.ast.expressions import PsExpression @dataclass(frozen=True) diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 6ab24e936..7431416c9 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,7 +1,6 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc -from .reduction import reduced_assign from .math import ( prod, @@ -34,7 +33,6 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", - "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", -- GitLab From d8a717874a0156d0c27c75b04c859b3a4a98268d Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:50:51 +0100 Subject: [PATCH 044/180] Add dictionary of reduced symbols to codegen context --- .../backend/kernelcreation/context.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index b6bf09dba..39205d707 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -75,7 +75,7 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - # TODO: add list of reduction symbols + self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -170,6 +170,21 @@ class KernelCreationContext: self._symbols[old.name] = new + def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty): + """Adds a reduction property to a symbol. + + The symbol ``symbol`` should not have a reduction property and must exist in the symbol table. 
+ """ + if self.find_symbol(symbol.name) is None: + raise PsInternalCompilerError( + "add_reduction_to_symbol: Symbol does not exist in the symbol table" + ) + + if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): + self._symbols_with_reduction[symbol] = reduction + else: + raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property") + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: -- GitLab From 3d592ab07194bc623884b81413278d954434e89b Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 15 Jan 2025 17:51:22 +0100 Subject: [PATCH 045/180] Try fixing circular module import --- src/pystencils/backend/kernelcreation/context.py | 2 ++ src/pystencils/codegen/properties.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 39205d707..258204f8d 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -9,6 +9,8 @@ from ...defaults import DEFAULTS from ...field import Field, FieldType from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType +from ...codegen.properties import ReductionSymbolProperty + from ..memory import PsSymbol, PsBuffer from ..constants import PsConstant from ...types import ( diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index cc4ff4101..2b0af986a 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,9 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field - -from ..backend.ast.expressions import PsExpression - +from typing import Any @dataclass(frozen=True) class PsSymbolProperty: @@ -21,7 +19,7 @@ class ReductionSymbolProperty(UniqueSymbolProperty): """Symbol acts as a base pointer to a field.""" op: str - init_val: PsExpression + init_val: Any # TODO: type? 
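# Usage sketch (symbol names illustrative): during freezing the reduced symbol is given
# this property, e.g.
#     ctx.add_reduction_to_symbol(w, ReductionSymbolProperty("+", PsConstant(0)))
# so that the reduction op and its neutral initial value are available to later passes,
# e.g. for the OpenMP reduction clause emitted in add_pragmas.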
@dataclass(frozen=True) -- GitLab From e5861425c5e2e4868e0c696763297768814c02ec Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 12:59:16 +0100 Subject: [PATCH 046/180] Minor adaptation on how symbols are given reduction property --- src/pystencils/backend/kernelcreation/context.py | 5 +++-- src/pystencils/backend/kernelcreation/freeze.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 258204f8d..e41f8371c 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -179,13 +179,14 @@ class KernelCreationContext: """ if self.find_symbol(symbol.name) is None: raise PsInternalCompilerError( - "add_reduction_to_symbol: Symbol does not exist in the symbol table" + f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table" ) if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): + symbol.add_property(reduction) self._symbols_with_reduction[symbol] = reduction else: - raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property") + raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property") def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 7316e2f9f..ae728dd49 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -213,7 +213,7 @@ class FreezeExpressions: case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val)) + self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) return PsAssignment(lhs, op(lhs.clone(), rhs)) -- GitLab From c96a94619ff058ba81d798b6f4bfbe06cd273535 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 13:33:12 +0100 Subject: [PATCH 047/180] Add C function selection for numeric limits functions --- src/pystencils/backend/platforms/generic_cpu.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index affeb34d4..6e3c58e6f 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -3,8 +3,8 @@ from typing import Sequence from pystencils.backend.ast.expressions import PsCall -from ..functions import CFunction, PsMathFunction, MathFunctions -from ...types import PsIntegerType, PsIeeeFloatType +from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType from .platform import Platform from ..exceptions import MaterializationError @@ -62,7 +62,10 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - # TODO: numeric limits + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) + call.function = cfunc + return call if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: 
CFunction -- GitLab From 99a3335135baf5da92ec56713f96beae37798b60 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 15:52:05 +0100 Subject: [PATCH 048/180] Add omp reduction clauses for reduced symbols --- src/pystencils/backend/kernelcreation/context.py | 5 +++++ src/pystencils/backend/transformations/add_pragmas.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index e41f8371c..a8728e6ac 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -223,6 +223,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]: + """Return a dictionary holding symbols and their reduction property.""" + return self._symbols_with_reduction + # Fields and Arrays @property diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 78e721f38..6d72e1550 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -10,6 +10,8 @@ from ..ast import PsAstNode from ..ast.structural import PsBlock, PsLoop, PsPragma from ..ast.expressions import PsExpression +from ...types import PsScalarType + if TYPE_CHECKING: from ...codegen.config import OpenMpConfig @@ -110,6 +112,13 @@ class AddOpenMP: pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" + if bool(ctx.symbols_with_reduction): + for symbol, reduction in ctx.symbols_with_reduction.items(): + if isinstance(symbol.dtype, PsScalarType): + pragma_text += f" reduction({reduction.op}: {symbol.name})" + else: + NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") + if omp_params.num_threads is not None: pragma_text += f" num_threads({str(omp_params.num_threads)})" -- GitLab From f00708edcaec67412905fbd26842735152da9812 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:15:24 +0100 Subject: [PATCH 049/180] Reformat reduction.py --- src/pystencils/sympyextensions/reduction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index e2760cc6c..c9e5bfdfb 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -12,7 +12,7 @@ class ReducedAssignment(AssignmentBase): Symbol for binary operation being applied in the assignment, such as "+", "*", etc. 
""" - binop = None # type: str + binop = None # type: str @property def op(self): @@ -22,6 +22,7 @@ class ReducedAssignment(AssignmentBase): class AddReducedAssignment(ReducedAssignment): binop = '+' + class SubReducedAssignment(ReducedAssignment): binop = '-' @@ -33,6 +34,7 @@ class MulReducedAssignment(ReducedAssignment): class MinReducedssignment(ReducedAssignment): binop = 'min' + class MaxReducedssignment(ReducedAssignment): binop = 'max' @@ -45,7 +47,8 @@ reduced_assign_classes = { ] } + def reduced_assign(lhs, op, rhs): if op not in reduced_assign_classes: raise ValueError("Unrecognized operator %s" % op) - return reduced_assign_classes[op](lhs, rhs) \ No newline at end of file + return reduced_assign_classes[op](lhs, rhs) -- GitLab From 71aaf722f0a3aecfcc79e2633311aa8f5677b42d Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:18:37 +0100 Subject: [PATCH 050/180] Add back reduced_assign to sympyextensions interface --- src/pystencils/sympyextensions/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 7431416c9..6ab24e936 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,6 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc +from .reduction import reduced_assign from .math import ( prod, @@ -33,6 +34,7 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", + "reduced_assign", "TypedSymbol", "CastFunc", "mem_acc", -- GitLab From f30ca33b9ea897e66c996759a40b1aebf43a3688 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:19:23 +0100 Subject: [PATCH 051/180] Fix inheritance of special math function enum classes --- src/pystencils/backend/functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index ea0d6cb9d..736345395 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -94,7 +94,7 @@ class MathFunctions(Enum): self.num_args = num_args -class NumericLimitsFunctions(MathFunctions): +class NumericLimitsFunctions(Enum): """Numerical limits functions supported by the backend. Each platform has to materialize these functions to a concrete implementation. 
@@ -109,12 +109,12 @@ class PsMathFunction(PsFunction): __match_args__ = ("func",) - def __init__(self, func: MathFunctions) -> None: + def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None: super().__init__(func.function_name, func.num_args) self._func = func @property - def func(self) -> MathFunctions: + def func(self) -> MathFunctions | NumericLimitsFunctions: return self._func def __str__(self) -> str: -- GitLab From 8fb5af398d208cf9779f7b0c528237d0d79ef7fb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 16:20:14 +0100 Subject: [PATCH 052/180] Fix header include of limits.h --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 6e3c58e6f..ae59d0423 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -43,7 +43,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>", "<climits.h"} + return {"<math.h>", "<limits.h>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace -- GitLab From dc5898a77e7b72a2221af8b66fa56640c2516e76 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 17:30:25 +0100 Subject: [PATCH 053/180] Omit distinction between normal and reduced assignments in AssignmentCollection --- src/pystencils/simp/assignment_collection.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index 212dbf751..03b4edccf 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -9,8 +9,6 @@ import pystencils from ..assignment import Assignment from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs) from ..sympyextensions import count_operations, fast_subs -from ..sympyextensions import reduced_assign -from ..sympyextensions.reduction import ReducedAssignment class AssignmentCollection: @@ -58,13 +56,8 @@ class AssignmentCollection: subexpressions = list(itertools.chain.from_iterable( [(a if isinstance(a, Iterable) else [a]) for a in subexpressions])) - # filter out reduced assignments - reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)] - main_assignments = [a for a in main_assignments if (a not in reduced_assignments)] - self.main_assignments = main_assignments self.subexpressions = subexpressions - self.reductions = reduced_assignments if simplification_hints is None: simplification_hints = {} @@ -79,11 +72,6 @@ class AssignmentCollection: else: self.subexpression_symbol_generator = subexpression_symbol_generator - def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None: - """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" - assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists" - self.reductions.append(reduced_assign(lhs, op, rhs)) - def add_simplification_hint(self, key: str, value: Any) -> None: """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet.""" assert key not in self.simplification_hints, "This hint already exists" -- GitLab From 97c171f3cd5371152fd102d92e302fc13ae4ee2b Mon Sep 17 00:00:00 2001 From: zy69guqi 
<richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 17:38:46 +0100 Subject: [PATCH 054/180] Adaptations to reduction test --- tests/kernelcreation/test_reduction.py | 40 ++++++++------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index f8c2b1870..0532b30f5 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -6,39 +6,23 @@ import pystencils as ps from pystencils import AddReducedAssignment -@pytest.mark.parametrize('dtype', ["float64", "float32"]) -def test_log(dtype): - a = sp.Symbol("a") +@pytest.mark.parametrize('dtype', ["float64"]) +def test_reduction(dtype): x = ps.fields(f'x: {dtype}[1d]') + w = sp.Symbol("w") - # kernel with main assignments and no reduction + # kernel with reduction assignment - main_assignment = ps.AssignmentCollection({x.center(): a}) + reduction_assignment = AddReducedAssignment(w, x.center()) - ast_main = ps.create_kernel(main_assignment, default_dtype=dtype) - code_main = ps.get_code_str(ast_main) - kernel_main = ast_main.compile() + config = ps.CreateKernelConfig(cpu_openmp=True) - # ps.show_code(ast) - - if dtype == "float64": - assert "float" not in code_main - - array = np.zeros((10,), dtype=dtype) - kernel_main(x=array, a=100) - assert np.allclose(array, 4.60517019) - - # kernel with single reduction assignment - - omega = sp.Symbol("omega") - - reduction_assignment = AddReducedAssignment(omega, x.center()) - - ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype) - code_reduction = ps.get_code_str(ast_reduction) + ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype) + #code_reduction = ps.get_code_str(ast_reduction) kernel_reduction = ast_reduction.compile() - if dtype == "float64": - assert "float" not in code_reduction + ps.show_code(ast_reduction) - ps.show_code(ast_reduction) \ No newline at end of file + array = np.ones((10,), dtype=dtype) + kernel_reduction(x=array, w=0) + # TODO: check if "w = #points" \ No newline at end of file -- GitLab From 355d638aab1179db95204d490fe585f7f4fcb7c1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 18:15:58 +0100 Subject: [PATCH 055/180] Rename min/max of numeric limits enum --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index ae59d0423..620cf9cfb 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -62,7 +62,7 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max): cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call -- GitLab From 55c9812023c384c7978f2536020d8587b7a12019 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 16 Jan 2025 18:38:34 +0100 Subject: [PATCH 056/180] Adapt comment of ReductionSymbolProperty --- src/pystencils/codegen/properties.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pystencils/codegen/properties.py 
b/src/pystencils/codegen/properties.py index 2b0af986a..0bad4e898 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from ..field import Field from typing import Any + @dataclass(frozen=True) class PsSymbolProperty: """Base class for symbol properties, which can be used to add additional information to symbols""" @@ -16,10 +17,10 @@ class UniqueSymbolProperty(PsSymbolProperty): @dataclass(frozen=True) class ReductionSymbolProperty(UniqueSymbolProperty): - """Symbol acts as a base pointer to a field.""" + """Property for symbols specifying the operation and initial value for a reduction.""" op: str - init_val: Any # TODO: type? + init_val: Any # TODO: type? @dataclass(frozen=True) -- GitLab From 6f8fbdfe7a6cbf14c6f64a86315fa435ed0c9336 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 13:43:32 +0100 Subject: [PATCH 057/180] Fix removal of function parameters for lhs symbols that are not declared in the kernel --- src/pystencils/backend/ast/analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py index edeba04f2..7032690a0 100644 --- a/src/pystencils/backend/ast/analysis.py +++ b/src/pystencils/backend/ast/analysis.py @@ -62,7 +62,7 @@ class UndefinedSymbolsCollector: case PsAssignment(lhs, rhs): undefined_vars = self(lhs) | self(rhs) - if isinstance(lhs, PsSymbolExpr): + if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr): undefined_vars.remove(lhs.symbol) return undefined_vars -- GitLab From b4dd0c8c55d26f87b4467f814c526fafc2ced76b Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:29:10 +0100 Subject: [PATCH 058/180] Fix usage of numerical limits for init value of reduction --- src/pystencils/backend/functions.py | 8 ++++++-- src/pystencils/backend/kernelcreation/freeze.py | 4 ++-- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 736345395..18c2277cf 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -100,8 +100,12 @@ class NumericLimitsFunctions(Enum): Each platform has to materialize these functions to a concrete implementation. 
""" - min = ("min", 0) - max = ("max", 0) + Min = ("min", 0) + Max = ("max", 0) + + def __init__(self, func_name, num_args): + self.function_name = func_name + self.num_args = num_args class PsMathFunction(PsFunction): diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index ae728dd49..9a34303e2 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -206,10 +206,10 @@ class FreezeExpressions: # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards case "min": op = sp.Min - init_val = NumericLimitsFunctions("min") + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case "max": op = sp.Max - init_val = NumericLimitsFunctions("max") + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 620cf9cfb..ae59d0423 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -62,7 +62,7 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max): + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call -- GitLab From ffcd54e053a2ddda2e015d2836184f2fcaef59f3 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:55:42 +0100 Subject: [PATCH 059/180] Fix min/max reductions --- src/pystencils/backend/kernelcreation/freeze.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 9a34303e2..64230203f 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -193,29 +193,31 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) + new_rhs: PsExpression match expr.op: case "+": - op = add init_val = PsConstant(0) + new_rhs = add(lhs.clone(), rhs) case "-": - op = sub init_val = PsConstant(0) + new_rhs = sub(lhs.clone(), rhs) case "*": - op = mul init_val = PsConstant(1) - # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards + new_rhs = mul(lhs.clone(), rhs) case "min": - op = sp.Min init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs]) case "max": - op = sp.Max init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + # set reduction symbol property in context self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) - return PsAssignment(lhs, op(lhs.clone(), rhs)) + return PsAssignment(lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) -- GitLab From 
dcdfff042120db9960f1859a60fdeb1890f02878 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 14:56:12 +0100 Subject: [PATCH 060/180] Parameterize test_reduction.py for different reduction operations --- tests/kernelcreation/test_reduction.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 0532b30f5..c41d250f4 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -3,17 +3,18 @@ import numpy as np import sympy as sp import pystencils as ps -from pystencils import AddReducedAssignment +from pystencils.sympyextensions import reduced_assign @pytest.mark.parametrize('dtype', ["float64"]) -def test_reduction(dtype): +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction(dtype, op): x = ps.fields(f'x: {dtype}[1d]') w = sp.Symbol("w") # kernel with reduction assignment - reduction_assignment = AddReducedAssignment(w, x.center()) + reduction_assignment = reduced_assign(w, op, x.center()) config = ps.CreateKernelConfig(cpu_openmp=True) -- GitLab From 2c15b9890291deea0cb929ae3a5221f3a0671a45 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 15:02:19 +0100 Subject: [PATCH 061/180] Define type of init_val for reduction as Any --- src/pystencils/backend/kernelcreation/freeze.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 64230203f..840329013 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -195,6 +195,7 @@ class FreezeExpressions: # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression + init_val: Any # TODO: type? 
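Condensed, the user-facing flow that the parameterized test above exercises reads as follows (a sketch distilled from test_reduction.py; only the kernel construction is shown, array setup and result checks are omitted):

    import sympy as sp
    import pystencils as ps
    from pystencils.sympyextensions import reduced_assign

    x = ps.fields("x: float64[1d]")
    w = sp.Symbol("w")
    assignment = reduced_assign(w, "+", x.center())   # op may be "+", "-", "*", "min" or "max"
    config = ps.CreateKernelConfig(cpu_openmp=True)
    kernel = ps.create_kernel([assignment], config, default_dtype="float64").compile()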
match expr.op: case "+": init_val = PsConstant(0) -- GitLab From 6a7a251f77d0274c32729bc5bfacfe0308d3fec9 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 17 Jan 2025 15:06:57 +0100 Subject: [PATCH 062/180] Try fix mypy no-redef error --- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index ae59d0423..2b4309627 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -63,12 +63,12 @@ class GenericCpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + cfunc: CFunction cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) call.function = cfunc return call if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): - cfunc: CFunction match func: case ( MathFunctions.Exp -- GitLab From 45ab4e86617492462188a5fc46d3160450a54bf0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 20 Jan 2025 17:46:49 +0100 Subject: [PATCH 063/180] Try initializing kernel-local reduction variable copy --- .../backend/kernelcreation/freeze.py | 28 +++++++++++-------- src/pystencils/codegen/driver.py | 12 +++++++- src/pystencils/codegen/properties.py | 7 +++-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 840329013..e0dcba8fd 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -7,6 +7,7 @@ import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment +from ..memory import PsSymbol from ...assignment import Assignment from ...simp import AssignmentCollection from ...sympyextensions import ( @@ -193,32 +194,37 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + # create kernel-local copy of lhs symbol to work with + new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype) + new_lhs = PsSymbolExpr(new_lhs_symbol) + self._ctx.add_symbol(new_lhs_symbol) + # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression - init_val: Any # TODO: type? 
+ init_val: PsExpression match expr.op: case "+": - init_val = PsConstant(0) - new_rhs = add(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(0)) + new_rhs = add(new_lhs.clone(), rhs) case "-": - init_val = PsConstant(0) - new_rhs = sub(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(0)) + new_rhs = sub(new_lhs.clone(), rhs) case "*": - init_val = PsConstant(1) - new_rhs = mul(lhs.clone(), rhs) + init_val = PsConstantExpr(PsConstant(1)) + new_rhs = mul(new_lhs.clone(), rhs) case "min": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) - new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs]) + new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) - new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs]) + new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") # set reduction symbol property in context - self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val)) + self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol)) - return PsAssignment(lhs, new_rhs) + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 28b685b55..0293cce48 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,12 +7,13 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter +from ..backend.ast.expressions import PsSymbolExpr from ..types import create_numeric_type, PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop +from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -152,6 +153,14 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Init local reduction variable copy + # for red, prop in self._ctx.symbols_with_reduction.items(): + # kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + + # Write back result to reduction target variable + # for red, prop in self._ctx.symbols_with_reduction.items(): + # kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + # Target-Specific optimizations if self._cfg.target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -450,6 +459,7 @@ def _get_function_params( props: set[PsSymbolProperty] = set() for prop in symb.properties: match prop: + # TODO: how to export reduction result (via pointer)? 
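Spelled out, the commented-out prologue and epilogue above are meant to wrap the kernel body as follows once enabled (sketch using the names from this hunk, with self._ctx abbreviated to ctx; a later patch in this series switches the prologue to a PsDeclaration and the write-back to a PsMemAcc store):

    # Prologue: seed the reduction variables with their init values.
    for red, prop in ctx.symbols_with_reduction.items():
        kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
    # Epilogue: write the accumulated result back to the original reduction target.
    for red, prop in ctx.symbols_with_reduction.items():
        kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]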
case FieldShape() | FieldStride(): props.add(prop) case BufferBasePtr(buf): diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 0bad4e898..4b8e7f2bf 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,7 +2,6 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field -from typing import Any @dataclass(frozen=True) @@ -19,8 +18,12 @@ class UniqueSymbolProperty(PsSymbolProperty): class ReductionSymbolProperty(UniqueSymbolProperty): """Property for symbols specifying the operation and initial value for a reduction.""" + from ..backend.memory import PsSymbol + from ..backend.ast.expressions import PsExpression + op: str - init_val: Any # TODO: type? + init_val: PsExpression + orig_symbol: PsSymbol @dataclass(frozen=True) -- GitLab From 0a9abc2a24ec30a444613178a94382da5355a6ef Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 13:55:35 +0100 Subject: [PATCH 064/180] Swap out neutral init values for reduced assignments with min/max op --- src/pystencils/backend/kernelcreation/freeze.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index e0dcba8fd..06d98a44e 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -213,10 +213,10 @@ class FreezeExpressions: init_val = PsConstantExpr(PsConstant(1)) new_rhs = mul(new_lhs.clone(), rhs) case "min": - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") -- GitLab From 3c276118bec2981a9bfc5a0d9654fec358094269 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 17:34:06 +0100 Subject: [PATCH 065/180] Fix declaration of local reduction var and write back to original variable --- src/pystencils/codegen/driver.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 0293cce48..06a5fd44a 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -13,7 +13,7 @@ from ..types import create_numeric_type, PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment +from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -154,12 +154,16 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Init local reduction variable copy - # for red, prop in self._ctx.symbols_with_reduction.items(): - # kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + for red, prop in self._ctx.symbols_with_reduction.items(): + kernel_ast.statements = 
[PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements # Write back result to reduction target variable - # for red, prop in self._ctx.symbols_with_reduction.items(): - # kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + for red, prop in self._ctx.symbols_with_reduction.items(): + kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] + + # TODO: can this be omitted? + typify = Typifier(self._ctx) + kernel_ast = typify(kernel_ast) # Target-Specific optimizations if self._cfg.target.is_cpu(): -- GitLab From c51ae2b438d688f871ba45b69476ef0c3b475462 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 21 Jan 2025 18:42:43 +0100 Subject: [PATCH 066/180] Set type of reduced variable to pointer and write back via PsMemAcc --- src/pystencils/backend/kernelcreation/freeze.py | 15 ++++++++++----- src/pystencils/codegen/driver.py | 10 ++++------ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 06d98a44e..d8fb1b91e 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -61,7 +61,7 @@ from ..ast.expressions import ( from ..ast.vector import PsVecMemAcc from ..constants import PsConstant -from ...types import PsNumericType, PsStructType, PsType +from ...types import PsNumericType, PsStructType, PsType, PsPointerType from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError @@ -195,9 +195,9 @@ class FreezeExpressions: assert isinstance(lhs, PsSymbolExpr) # create kernel-local copy of lhs symbol to work with - new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype) - new_lhs = PsSymbolExpr(new_lhs_symbol) - self._ctx.add_symbol(new_lhs_symbol) + new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype) + new_lhs = PsSymbolExpr(new_lhs_symb) + self._ctx.add_symbol(new_lhs_symb) # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression @@ -221,8 +221,13 @@ class FreezeExpressions: case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + # replace original symbol with pointer-based type used for export + orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype)) + self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr) + # set reduction symbol property in context - self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol)) + init_val.dtype = rhs.dtype + self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr)) return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 06a5fd44a..20615ba21 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter -from ..backend.ast.expressions import PsSymbolExpr +from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr from ..types import create_numeric_type, PsIntegerType, PsScalarType @@ -159,11 
+159,9 @@ class DefaultKernelCreationDriver: # Write back result to reduction target variable for red, prop in self._ctx.symbols_with_reduction.items(): - kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))] - - # TODO: can this be omitted? - typify = Typifier(self._ctx) - kernel_ast = typify(kernel_ast) + kernel_ast.statements += [PsAssignment( + PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), + PsSymbolExpr(red))] # Target-Specific optimizations if self._cfg.target.is_cpu(): -- GitLab From c6eedfcda96e84e8279ee624ba3c113f2339bfbe Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 15:16:32 +0100 Subject: [PATCH 067/180] Split reduction var property into local and pointer-based reduction var properties --- .../backend/kernelcreation/context.py | 54 ++++++++++++++----- .../backend/kernelcreation/freeze.py | 31 ++++++----- .../backend/transformations/add_pragmas.py | 4 +- src/pystencils/codegen/driver.py | 10 ++-- src/pystencils/codegen/properties.py | 16 ++++-- 5 files changed, 78 insertions(+), 37 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index a8728e6ac..2f46a7421 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -9,7 +9,7 @@ from ...defaults import DEFAULTS from ...field import Field, FieldType from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType -from ...codegen.properties import ReductionSymbolProperty +from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable from ..memory import PsSymbol, PsBuffer from ..constants import PsConstant @@ -77,7 +77,8 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict() + self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict() + self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -172,21 +173,41 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty): - """Adds a reduction property to a symbol. + def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable): + """Adds entry for a symbol and its property to the lookup table for local reduction variables. - The symbol ``symbol`` should not have a reduction property and must exist in the symbol table. + The symbol ``symbol`` should not have a 'LocalReductionSymbol' property and shall not exist in the symbol table. 
""" - if self.find_symbol(symbol.name) is None: + if self.find_symbol(local_symb.name) is not None: raise PsInternalCompilerError( - f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table" + f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table" ) + self.add_symbol(local_symb) - if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty): - symbol.add_property(reduction) - self._symbols_with_reduction[symbol] = reduction + if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable): + local_symb.add_property(local_var_prop) + self._local_reduction_symbols[local_symb] = local_var_prop else: - raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property") + raise PsInternalCompilerError( + f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table" + ) + + def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable): + """Replaces reduction symbol with a pointer-based counterpart used for export + and adds the new symbol and its property to the lookup table for pointer-based reduction variables + + The symbol ``ptr_symbol`` should not exist in the symbol table. + """ + self.replace_symbol(orig_symb, ptr_symb) + + if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties( + ReductionPointerVariable): + ptr_symb.add_property(ptr_var_prop) + self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop + else: + raise PsInternalCompilerError( + f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table " + ) def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None @@ -224,9 +245,14 @@ class KernelCreationContext: return self._symbols.values() @property - def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]: - """Return a dictionary holding symbols and their reduction property.""" - return self._symbols_with_reduction + def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]: + """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" + return self._local_reduction_symbols + + @property + def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]: + """Return a dictionary holding pointer-based reduction symbols and their reduction properties.""" + return self._reduction_ptr_symbols # Fields and Arrays diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index d8fb1b91e..1e9984def 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -66,7 +66,7 @@ from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError -from ...codegen.properties import ReductionSymbolProperty +from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable ExprLike = ( @@ -194,40 +194,45 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + orig_lhs_symb = lhs.symbol + dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? 
+ + # replace original symbol with pointer-based type used for export + orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) + # create kernel-local copy of lhs symbol to work with - new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype) + new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) new_lhs = PsSymbolExpr(new_lhs_symb) - self._ctx.add_symbol(new_lhs_symb) # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) new_rhs: PsExpression init_val: PsExpression match expr.op: case "+": - init_val = PsConstantExpr(PsConstant(0)) + init_val = PsConstantExpr(PsConstant(0, dtype)) new_rhs = add(new_lhs.clone(), rhs) case "-": - init_val = PsConstantExpr(PsConstant(0)) + init_val = PsConstantExpr(PsConstant(0, dtype)) new_rhs = sub(new_lhs.clone(), rhs) case "*": - init_val = PsConstantExpr(PsConstant(1)) + init_val = PsConstantExpr(PsConstant(1, dtype)) new_rhs = mul(new_lhs.clone(), rhs) case "min": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + init_val.dtype = dtype new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + init_val.dtype = dtype new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - # replace original symbol with pointer-based type used for export - orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype)) - self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr) - - # set reduction symbol property in context - init_val.dtype = rhs.dtype - self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr)) + # set reduction symbol properties (local/pointer variables) in context + self._ctx.add_local_reduction_symbol(new_lhs_symb, + LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr)) + self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr, + ReductionPointerVariable(expr.op, new_lhs_symb)) return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 6d72e1550..44d1d1ede 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -112,8 +112,8 @@ class AddOpenMP: pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" - if bool(ctx.symbols_with_reduction): - for symbol, reduction in ctx.symbols_with_reduction.items(): + if bool(ctx.local_reduction_symbols): + for symbol, reduction in ctx.local_reduction_symbols.items(): if isinstance(symbol.dtype, PsScalarType): pragma_text += f" reduction({reduction.op}: {symbol.name})" else: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 20615ba21..7f90f62ce 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -154,14 +154,14 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Init local reduction variable copy - for red, prop in self._ctx.symbols_with_reduction.items(): - kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements + for local_red, prop in self._ctx.local_reduction_symbols.items(): + kernel_ast.statements = 
[PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements # Write back result to reduction target variable - for red, prop in self._ctx.symbols_with_reduction.items(): + for red_ptr, prop in self._ctx.reduction_pointer_symbols.items(): kernel_ast.statements += [PsAssignment( - PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), - PsSymbolExpr(red))] + PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), + PsSymbolExpr(prop.local_symbol))] # Target-Specific optimizations if self._cfg.target.is_cpu(): diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 4b8e7f2bf..1e71c5b98 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -15,15 +15,25 @@ class UniqueSymbolProperty(PsSymbolProperty): @dataclass(frozen=True) -class ReductionSymbolProperty(UniqueSymbolProperty): - """Property for symbols specifying the operation and initial value for a reduction.""" +class LocalReductionVariable(PsSymbolProperty): + """Property for symbols specifying the operation and initial value for a kernel-local reduction variable.""" from ..backend.memory import PsSymbol from ..backend.ast.expressions import PsExpression op: str init_val: PsExpression - orig_symbol: PsSymbol + ptr_symbol: PsSymbol + + +@dataclass(frozen=True) +class ReductionPointerVariable(PsSymbolProperty): + """Property for pointer-type symbols exporting the reduction result from the kernel.""" + + from ..backend.memory import PsSymbol + + op: str + local_symbol: PsSymbol @dataclass(frozen=True) -- GitLab From 3e0daa67359c7ddc17264b7fd21aa0a0429552e5 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:01:00 +0100 Subject: [PATCH 068/180] Propagate properties of reduction pointer symbols to kernel parameters --- src/pystencils/codegen/driver.py | 5 +++-- src/pystencils/jit/cpu_extension_module.py | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 7f90f62ce..f414b953e 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, replace from .target import Target from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange -from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr +from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr @@ -461,7 +461,8 @@ def _get_function_params( props: set[PsSymbolProperty] = set() for prop in symb.properties: match prop: - # TODO: how to export reduction result (via pointer)? 
+ case ReductionPointerVariable(): + props.add(prop) case FieldShape() | FieldStride(): props.add(prop) case BufferBasePtr(buf): diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index befb033e6..c2c969eaa 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -13,7 +13,7 @@ from ..codegen import ( Kernel, Parameter, ) -from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride +from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable from ..types import ( PsType, PsUnsignedIntegerType, @@ -265,7 +265,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return self._array_buffers[field] def extract_scalar(self, param: Parameter) -> str: - if param not in self._scalar_extractions: + if any(isinstance(e, ReductionPointerVariable) for e in param.properties): + # TODO: implement + pass + elif param not in self._scalar_extractions: extract_func = self._scalar_extractor(param.dtype) code = self.TMPL_EXTRACT_SCALAR.format( name=param.name, -- GitLab From 777ab888d5032d7630827f91fd26c388a9a09db2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:01:51 +0100 Subject: [PATCH 069/180] Use literals for C macros used for the numeric limits --- .../backend/platforms/generic_cpu.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 2b4309627..58b9c7946 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,6 +4,7 @@ from typing import Sequence from pystencils.backend.ast.expressions import PsCall from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions +from ..literals import PsLiteral from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType from .platform import Platform @@ -25,7 +26,7 @@ from ..ast.expressions import ( PsLookup, PsGe, PsLe, - PsTernary, + PsTernary, PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -43,7 +44,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>", "<limits.h>"} + return {"<math.h>", "<limits.h>", "<float.h>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -63,12 +64,25 @@ class GenericCpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - cfunc: CFunction - cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype) - call.function = cfunc - return call + # get type prefix for macro + # TODO: there must be a better way... 
+ tpe = "" + match dtype: + case PsIeeeFloatType(): + match dtype.width: + case 32: + tpe = "FLT" + case 64: + tpe = "DBL" + case _: + raise MaterializationError( + f"No implementation available for function {func} on data type {dtype}" + ) + + return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype)) if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): + cfunc: CFunction match func: case ( MathFunctions.Exp -- GitLab From f1c556e6f93d5fa042e12e8a0a9c57f3bdea47b7 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 16:30:43 +0100 Subject: [PATCH 070/180] Integrate reduction pointers to parameters.py --- src/pystencils/codegen/parameters.py | 16 ++++++++++++++-- src/pystencils/jit/cpu_extension_module.py | 17 ++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py index d8411266e..094553517 100644 --- a/src/pystencils/codegen/parameters.py +++ b/src/pystencils/codegen/parameters.py @@ -1,14 +1,14 @@ from __future__ import annotations from warnings import warn -from typing import Sequence, Iterable +from typing import Sequence, Iterable, Optional from .properties import ( PsSymbolProperty, _FieldProperty, FieldShape, FieldStride, - FieldBasePtr, + FieldBasePtr, ReductionPointerVariable, ) from ..types import PsType from ..field import Field @@ -39,6 +39,9 @@ class Parameter: key=lambda f: f.name, ) ) + self._reduction_ptr: Optional[ReductionPointerVariable] = next( + (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None + ) @property def name(self): @@ -79,6 +82,11 @@ class Parameter: """Set of fields associated with this parameter.""" return self._fields + @property + def reduction_pointer(self) -> Optional[ReductionPointerVariable]: + """Reduction pointer associated with this parameter.""" + return self._reduction_ptr + def get_properties( self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...] 
) -> set[PsSymbolProperty]: @@ -105,6 +113,10 @@ class Parameter: ) return bool(self.get_properties(FieldBasePtr)) + @property + def is_reduction_pointer(self) -> bool: + return bool(self._reduction_ptr) + @property def is_field_stride(self) -> bool: # pragma: no cover warn( diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index c2c969eaa..f9c04200c 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -206,6 +206,8 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() + self._reduction_ptrs: dict[Parameter, str] = dict() + self._constraint_checks: list[str] = [] self._call: str | None = None @@ -265,10 +267,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return self._array_buffers[field] def extract_scalar(self, param: Parameter) -> str: - if any(isinstance(e, ReductionPointerVariable) for e in param.properties): - # TODO: implement - pass - elif param not in self._scalar_extractions: + if param not in self._scalar_extractions: extract_func = self._scalar_extractor(param.dtype) code = self.TMPL_EXTRACT_SCALAR.format( name=param.name, @@ -279,6 +278,12 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_reduction_ptr(self, param: Parameter) -> str: + if param not in self._reduction_ptrs: + # TODO: implement + pass + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -306,7 +311,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name def extract_parameter(self, param: Parameter): - if param.is_field_parameter: + if param.is_reduction_pointer: + self.extract_reduction_ptr(param) + elif param.is_field_parameter: self.extract_array_assoc_var(param) else: self.extract_scalar(param) -- GitLab From ba697180cac45133f756364ef8798d8437852026 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 17:23:02 +0100 Subject: [PATCH 071/180] Rewire existing code extraction of fields to support reduction pointer extraction --- src/pystencils/jit/cpu_extension_module.py | 53 ++++++++++++---------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index f9c04200c..d8d90c924 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -199,9 +199,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ """ def __init__(self) -> None: - self._array_buffers: dict[Field, str] = dict() - self._array_extractions: dict[Field, str] = dict() - self._array_frees: dict[Field, str] = dict() + self._array_buffers: dict[Any, str] = dict() + self._array_extractions: dict[Any, str] = dict() + self._array_frees: dict[Any, str] = dict() self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() @@ -235,36 +235,37 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ else: return None - def extract_field(self, field: Field) -> str: + def extract_buffer(self, buffer: Any, name: str, dtype: PsType) -> str: """Adds an array, and returns the name of the underlying Py_Buffer.""" - if field not in self._array_extractions: - extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name) + if buffer not in self._array_extractions: + extraction_code = 
self.TMPL_EXTRACT_ARRAY.format(name=name) # Check array type - type_char = self._type_char(field.dtype) + type_char = self._type_char(dtype) if type_char is not None: - dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'" + dtype_cond = f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, - expected=str(field.dtype), + name=name, + expected=str(dtype), ) # Check item size - itemsize = field.dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" - extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize - ) + itemsize = dtype.itemsize + if itemsize is not None: # itemsize of pointer not known (TODO?) + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" + extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( + cond=item_size_cond, what="itemsize", name=name, expected=itemsize + ) - self._array_buffers[field] = f"buffer_{field.name}" - self._array_extractions[field] = extraction_code + self._array_buffers[buffer] = f"buffer_{name}" + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + self._array_frees[buffer] = release_code - return self._array_buffers[field] + return self._array_buffers[buffer] def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -280,14 +281,20 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ def extract_reduction_ptr(self, param: Parameter) -> str: if param not in self._reduction_ptrs: - # TODO: implement - pass + ptr = param.reduction_pointer + buffer = self.extract_buffer(ptr, param.name, param.dtype) + code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + + assert code is not None + + self._array_assoc_var_extractions[param] = code + return param.name def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] - buffer = self.extract_field(field) + buffer = self.extract_buffer(field, field.name, field.dtype) code: str | None = None for prop in param.properties: -- GitLab From 3e595df6c79cc1a7a8c2ff4ab86825e81aadbf43 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 22 Jan 2025 17:35:50 +0100 Subject: [PATCH 072/180] Refine test_reduction.py to check for result correctness --- tests/kernelcreation/test_reduction.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c41d250f4..b97343e72 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -5,6 +5,15 @@ import sympy as sp import pystencils as ps from pystencils.sympyextensions import reduced_assign +INIT=2 +SIZE=15 +SOLUTION = { + "+": INIT * SIZE, + "-": INIT * -SIZE, + "*": INIT**SIZE, + "min": INIT, + "max": INIT +} @pytest.mark.parametrize('dtype', ["float64"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) @@ -24,6 +33,7 @@ def test_reduction(dtype, op): ps.show_code(ast_reduction) - array = np.ones((10,), dtype=dtype) - kernel_reduction(x=array, w=0) - # TODO: check if "w = #points" \ No newline at end of file + array = np.full((SIZE,), INIT, dtype=dtype) + reduction_array = np.zeros(1, dtype=dtype) + 
kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) \ No newline at end of file -- GitLab From b352a2e2e8c2d7f4eeb0861dacff5d703ae51869 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 23 Jan 2025 18:18:24 +0100 Subject: [PATCH 073/180] Fix lint for jit/cpu_extension_module.py --- src/pystencils/jit/cpu_extension_module.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index d8d90c924..6ec62c28d 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -13,7 +13,7 @@ from ..codegen import ( Kernel, Parameter, ) -from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable +from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride from ..types import ( PsType, PsUnsignedIntegerType, @@ -21,7 +21,6 @@ from ..types import ( PsIeeeFloatType, ) from ..types.quick import Fp, SInt, UInt -from ..field import Field class PsKernelExtensioNModule: -- GitLab From 4c726aa6aa2df6312252cb848ace95e790d62331 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 24 Jan 2025 14:43:32 +0100 Subject: [PATCH 074/180] Prepare reduction test for GPU support --- tests/kernelcreation/test_reduction.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index b97343e72..b56a24a19 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -1,6 +1,7 @@ import pytest import numpy as np import sympy as sp +import cupy as cp import pystencils as ps from pystencils.sympyextensions import reduced_assign @@ -18,6 +19,9 @@ SOLUTION = { @pytest.mark.parametrize('dtype', ["float64"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction(dtype, op): + + gpu_avail = True + x = ps.fields(f'x: {dtype}[1d]') w = sp.Symbol("w") @@ -25,7 +29,7 @@ def test_reduction(dtype, op): reduction_assignment = reduced_assign(w, op, x.center()) - config = ps.CreateKernelConfig(cpu_openmp=True) + config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True) ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype) #code_reduction = ps.get_code_str(ast_reduction) @@ -35,5 +39,13 @@ def test_reduction(dtype, op): array = np.full((SIZE,), INIT, dtype=dtype) reduction_array = np.zeros(1, dtype=dtype) - kernel_reduction(x=array, w=reduction_array) - assert np.allclose(reduction_array, SOLUTION[op]) \ No newline at end of file + + if gpu_avail: + array_gpu = cp.asarray(array) + reduction_array_gpu = cp.asarray(reduction_array) + + kernel_reduction(x=array_gpu, w=reduction_array_gpu) + assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) + else: + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) \ No newline at end of file -- GitLab From f0d2fde6848f9cddcf0c3b38ca169f2e85abc093 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 24 Jan 2025 16:20:37 +0100 Subject: [PATCH 075/180] Encapsulate mapping of binop strings to actual operands and now also use for considering initial value of passed reduction pointer value --- .../backend/kernelcreation/freeze.py | 25 ++++----------- src/pystencils/codegen/driver.py | 7 +++-- 
.../sympyextensions/binop_mapping.py | 31 +++++++++++++++++++ tests/kernelcreation/test_reduction.py | 29 ++++++++--------- 4 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 src/pystencils/sympyextensions/binop_mapping.py diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 1e9984def..f5f207acf 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -14,6 +14,7 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) +from ...sympyextensions.binop_mapping import binop_str_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReducedAssignment @@ -173,19 +174,7 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - match expr.op: - case "+=": - op = add - case "-=": - op = sub - case "*=": - op = mul - case "/=": - op = truediv - case _: - raise FreezeError(f"Unsupported augmented assignment: {expr.op}.") - - return PsAssignment(lhs, op(lhs.clone(), rhs)) + return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs)) def map_ReducedAssignment(self, expr: ReducedAssignment): lhs = self.visit(expr.lhs) @@ -204,27 +193,25 @@ class FreezeExpressions: new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) new_lhs = PsSymbolExpr(new_lhs_symb) - # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment) + # get new rhs from augmented assignment + new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs) + + # match for reduction operation and set neutral init_val new_rhs: PsExpression init_val: PsExpression match expr.op: case "+": init_val = PsConstantExpr(PsConstant(0, dtype)) - new_rhs = add(new_lhs.clone(), rhs) case "-": init_val = PsConstantExpr(PsConstant(0, dtype)) - new_rhs = sub(new_lhs.clone(), rhs) case "*": init_val = PsConstantExpr(PsConstant(1, dtype)) - new_rhs = mul(new_lhs.clone(), rhs) case "min": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) init_val.dtype = dtype - new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs]) case "max": init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) init_val.dtype = dtype - new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs]) case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index f414b953e..8b8ecd15b 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,7 +7,8 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter -from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr +from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr, PsExpression +from ..sympyextensions.binop_mapping import binop_str_to_expr from ..types import create_numeric_type, PsIntegerType, PsScalarType @@ -159,9 +160,9 @@ class DefaultKernelCreationDriver: # Write back result to reduction target variable for red_ptr, prop in self._ctx.reduction_pointer_symbols.items(): + 
ptr_access = PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) kernel_ast.statements += [PsAssignment( - PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))), - PsSymbolExpr(prop.local_symbol))] + ptr_access, binop_str_to_expr(prop.op, ptr_access, PsSymbolExpr(prop.local_symbol)))] # Target-Specific optimizations if self._cfg.target.is_cpu(): diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/sympyextensions/binop_mapping.py new file mode 100644 index 000000000..1cb2a3ab5 --- /dev/null +++ b/src/pystencils/sympyextensions/binop_mapping.py @@ -0,0 +1,31 @@ +from operator import truediv, mul, sub, add + +from src.pystencils.backend.ast.expressions import PsCall, PsExpression +from src.pystencils.backend.exceptions import FreezeError +from src.pystencils.backend.functions import MathFunctions, PsMathFunction + +_available_operator_interface: set[str] = {'+', '-', '*', '/'} + + +def binop_str_to_expr(op: str, op1, op2) -> PsExpression: + if op in _available_operator_interface: + match op: + case "+": + operator = add + case "-": + operator = sub + case "*": + operator = mul + case "/": + operator = truediv + case _: + raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") + return operator(op1, op2) + else: + match op: + case "min": + return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) + case "max": + return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) + case _: + raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index b56a24a19..c01dce5a6 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -6,21 +6,22 @@ import cupy as cp import pystencils as ps from pystencils.sympyextensions import reduced_assign -INIT=2 -SIZE=15 +INIT_W = 5 +INIT_ARR = 2 +SIZE = 15 SOLUTION = { - "+": INIT * SIZE, - "-": INIT * -SIZE, - "*": INIT**SIZE, - "min": INIT, - "max": INIT + "+": INIT_W + INIT_ARR * SIZE, + "-": INIT_W - INIT_ARR * -SIZE, + "*": INIT_W * INIT_ARR ** SIZE, + "min": min(INIT_W, INIT_ARR), + "max": max(INIT_W, INIT_ARR), } + @pytest.mark.parametrize('dtype', ["float64"]) -@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +@pytest.mark.parametrize("op", ["+", "-", "*"]) #, "min", "max"]) # TODO: min/max broken due to error in BasePrinter def test_reduction(dtype, op): - - gpu_avail = True + gpu_avail = False x = ps.fields(f'x: {dtype}[1d]') w = sp.Symbol("w") @@ -32,13 +33,13 @@ def test_reduction(dtype, op): config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True) ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype) - #code_reduction = ps.get_code_str(ast_reduction) + # code_reduction = ps.get_code_str(ast_reduction) kernel_reduction = ast_reduction.compile() ps.show_code(ast_reduction) - array = np.full((SIZE,), INIT, dtype=dtype) - reduction_array = np.zeros(1, dtype=dtype) + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) if gpu_avail: array_gpu = cp.asarray(array) @@ -48,4 +49,4 @@ def test_reduction(dtype, op): assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) else: kernel_reduction(x=array, w=reduction_array) - assert np.allclose(reduction_array, SOLUTION[op]) \ No newline at end of file + assert 
np.allclose(reduction_array, SOLUTION[op]) -- GitLab From 96b5cbf286a29882b496faee3b9fe3be481d8bb3 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 12:44:21 +0100 Subject: [PATCH 076/180] Fix lint --- src/pystencils/backend/kernelcreation/freeze.py | 5 ++--- src/pystencils/codegen/driver.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index f5f207acf..1238f16af 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,9 +1,8 @@ from typing import overload, cast, Any from functools import reduce -from operator import add, mul, sub, truediv +from operator import add, mul, sub import sympy as sp -import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment @@ -184,7 +183,7 @@ class FreezeExpressions: assert isinstance(lhs, PsSymbolExpr) orig_lhs_symb = lhs.symbol - dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? + dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? # replace original symbol with pointer-based type used for export orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 8b8ecd15b..b47ad8a9e 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter -from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr, PsExpression +from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr from ..sympyextensions.binop_mapping import binop_str_to_expr from ..types import create_numeric_type, PsIntegerType, PsScalarType -- GitLab From 3daaa5e5a1f92a5482cb838cd791ed062dae1398 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 12:47:15 +0100 Subject: [PATCH 077/180] Fix typecheck --- src/pystencils/sympyextensions/__init__.py | 2 ++ src/pystencils/sympyextensions/binop_mapping.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 6ab24e936..8d832ba2a 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -2,6 +2,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc from .reduction import reduced_assign +from .binop_mapping import binop_str_to_expr from .math import ( prod, @@ -35,6 +36,7 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", "reduced_assign", + "binop_str_to_expr", "TypedSymbol", "CastFunc", "mem_acc", diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/sympyextensions/binop_mapping.py index 1cb2a3ab5..04cfb6107 100644 --- a/src/pystencils/sympyextensions/binop_mapping.py +++ b/src/pystencils/sympyextensions/binop_mapping.py @@ -1,8 +1,8 @@ from operator import truediv, mul, sub, add -from src.pystencils.backend.ast.expressions import PsCall, PsExpression -from src.pystencils.backend.exceptions import 
FreezeError -from src.pystencils.backend.functions import MathFunctions, PsMathFunction +from ..backend.ast.expressions import PsCall, PsExpression +from ..backend.exceptions import FreezeError +from ..backend.functions import MathFunctions, PsMathFunction _available_operator_interface: set[str] = {'+', '-', '*', '/'} -- GitLab From c73deaf6d54da2b95310dc1e606ed132b465c874 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 13:11:14 +0100 Subject: [PATCH 078/180] Fix mypy errors and move binop mapping function --- src/pystencils/__init__.py | 2 ++ src/pystencils/backend/kernelcreation/freeze.py | 5 +++-- src/pystencils/{sympyextensions => }/binop_mapping.py | 6 +++--- src/pystencils/codegen/driver.py | 10 +++++----- src/pystencils/sympyextensions/__init__.py | 2 -- 5 files changed, 13 insertions(+), 12 deletions(-) rename src/pystencils/{sympyextensions => }/binop_mapping.py (85%) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 916a61392..3e8e8d8e4 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -45,6 +45,7 @@ from .sympyextensions.reduction import ( MinReducedssignment, MaxReducedssignment ) +from .binop_mapping import binop_str_to_expr __all__ = [ "Field", @@ -75,6 +76,7 @@ __all__ = [ "inspect", "AssignmentCollection", "Assignment", + "binop_str_to_expr", "AddAugmentedAssignment", "AddReducedAssignment", "SubReducedAssignment", diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 1238f16af..68868e143 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -13,7 +13,7 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) -from ...sympyextensions.binop_mapping import binop_str_to_expr +from ...binop_mapping import binop_str_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReducedAssignment @@ -185,6 +185,8 @@ class FreezeExpressions: orig_lhs_symb = lhs.symbol dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? 
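+        # dtype must be a concrete numeric type here: it is reused below both for
+        # the exported pointer type and for the neutral initial value constants.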
+ assert isinstance(dtype, PsNumericType) + # replace original symbol with pointer-based type used for export orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) @@ -196,7 +198,6 @@ class FreezeExpressions: new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs) # match for reduction operation and set neutral init_val - new_rhs: PsExpression init_val: PsExpression match expr.op: case "+": diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/binop_mapping.py similarity index 85% rename from src/pystencils/sympyextensions/binop_mapping.py rename to src/pystencils/binop_mapping.py index 04cfb6107..060fa40aa 100644 --- a/src/pystencils/sympyextensions/binop_mapping.py +++ b/src/pystencils/binop_mapping.py @@ -1,8 +1,8 @@ from operator import truediv, mul, sub, add -from ..backend.ast.expressions import PsCall, PsExpression -from ..backend.exceptions import FreezeError -from ..backend.functions import MathFunctions, PsMathFunction +from .backend.ast.expressions import PsExpression, PsCall +from .backend.exceptions import FreezeError +from .backend.functions import PsMathFunction, MathFunctions _available_operator_interface: set[str] = {'+', '-', '*', '/'} diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index b47ad8a9e..d68bfbcac 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,8 +7,8 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter +from ..binop_mapping import binop_str_to_expr from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr -from ..sympyextensions.binop_mapping import binop_str_to_expr from ..types import create_numeric_type, PsIntegerType, PsScalarType @@ -155,14 +155,14 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Init local reduction variable copy - for local_red, prop in self._ctx.local_reduction_symbols.items(): - kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements + for local_red, local_prop in self._ctx.local_reduction_symbols.items(): + kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), local_prop.init_val)] + kernel_ast.statements # Write back result to reduction target variable - for red_ptr, prop in self._ctx.reduction_pointer_symbols.items(): + for red_ptr, ptr_prop in self._ctx.reduction_pointer_symbols.items(): ptr_access = PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) kernel_ast.statements += [PsAssignment( - ptr_access, binop_str_to_expr(prop.op, ptr_access, PsSymbolExpr(prop.local_symbol)))] + ptr_access, binop_str_to_expr(ptr_prop.op, ptr_access, PsSymbolExpr(ptr_prop.local_symbol)))] # Target-Specific optimizations if self._cfg.target.is_cpu(): diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 8d832ba2a..6ab24e936 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -2,7 +2,6 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc from .reduction import reduced_assign -from .binop_mapping import binop_str_to_expr from .math import ( prod, @@ -36,7 
+35,6 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", "reduced_assign", - "binop_str_to_expr", "TypedSymbol", "CastFunc", "mem_acc", -- GitLab From f71ce708a1aaa876a700e880d8cd1b63a0d080ee Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 18:00:53 +0100 Subject: [PATCH 079/180] Enforce usage of typed symbols for reductions --- src/pystencils/backend/kernelcreation/freeze.py | 4 +++- tests/kernelcreation/test_reduction.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 68868e143..de272cf44 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -176,6 +176,8 @@ class FreezeExpressions: return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs)) def map_ReducedAssignment(self, expr: ReducedAssignment): + assert isinstance(expr.lhs, TypedSymbol) + lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) @@ -183,7 +185,7 @@ class FreezeExpressions: assert isinstance(lhs, PsSymbolExpr) orig_lhs_symb = lhs.symbol - dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts? + dtype = lhs.dtype assert isinstance(dtype, PsNumericType) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c01dce5a6..8095f4e1d 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -24,7 +24,7 @@ def test_reduction(dtype, op): gpu_avail = False x = ps.fields(f'x: {dtype}[1d]') - w = sp.Symbol("w") + w = ps.TypedSymbol("w", dtype) # kernel with reduction assignment -- GitLab From e94c4980ed860b127a27743fd9727192d667c906 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 19:18:00 +0100 Subject: [PATCH 080/180] Adapt reduction assignment interface and employ enums instead of strings for the binary operation employed --- src/pystencils/__init__.py | 22 +++---- .../backend/kernelcreation/freeze.py | 27 +++++--- .../backend/transformations/add_pragmas.py | 2 +- src/pystencils/codegen/driver.py | 2 +- src/pystencils/codegen/properties.py | 5 +- ...inop_mapping.py => compound_op_mapping.py} | 18 +++--- src/pystencils/sympyextensions/__init__.py | 6 +- src/pystencils/sympyextensions/reduction.py | 63 ++++++++++++------- tests/kernelcreation/test_reduction.py | 7 +-- 9 files changed, 90 insertions(+), 62 deletions(-) rename src/pystencils/{binop_mapping.py => compound_op_mapping.py} (65%) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index 3e8e8d8e4..6aa305a16 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -39,13 +39,12 @@ from .sympyextensions.typed_sympy import TypedSymbol, DynamicType from .sympyextensions import SymbolCreator from .datahandling import create_data_handling from .sympyextensions.reduction import ( - AddReducedAssignment, - SubReducedAssignment, - MulReducedAssignment, - MinReducedssignment, - MaxReducedssignment + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, ) -from .binop_mapping import binop_str_to_expr __all__ = [ "Field", @@ -76,13 +75,12 @@ __all__ = [ "inspect", "AssignmentCollection", "Assignment", - "binop_str_to_expr", "AddAugmentedAssignment", - "AddReducedAssignment", - "SubReducedAssignment", - "MulReducedAssignment", - "MinReducedssignment", - "MaxReducedssignment", + 
"AddReductionAssignment", + "SubReductionAssignment", + "MulReductionAssignment", + "MinReductionAssignment", + "MaxReductionAssignment", "assignment_from_stencil", "SymbolCreator", "create_data_handling", diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index de272cf44..4bf136562 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -13,10 +13,10 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) -from ...binop_mapping import binop_str_to_expr +from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc -from ...sympyextensions.reduction import ReducedAssignment +from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...field import Field, FieldType from .context import KernelCreationContext @@ -173,9 +173,16 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs)) + _str_to_compound_op: dict[str, ReductionOp] = { + "+=": ReductionOp.Add, + "-=": ReductionOp.Sub, + "*=": ReductionOp.Mul, + "/=": ReductionOp.Div, + } - def map_ReducedAssignment(self, expr: ReducedAssignment): + return PsAssignment(lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs)) + + def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) lhs = self.visit(expr.lhs) @@ -197,21 +204,21 @@ class FreezeExpressions: new_lhs = PsSymbolExpr(new_lhs_symb) # get new rhs from augmented assignment - new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs) + new_rhs: PsExpression = compound_op_to_expr(expr.op, new_lhs.clone(), rhs) # match for reduction operation and set neutral init_val init_val: PsExpression match expr.op: - case "+": + case ReductionOp.Add: init_val = PsConstantExpr(PsConstant(0, dtype)) - case "-": + case ReductionOp.Sub: init_val = PsConstantExpr(PsConstant(0, dtype)) - case "*": + case ReductionOp.Mul: init_val = PsConstantExpr(PsConstant(1, dtype)) - case "min": + case ReductionOp.Min: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) init_val.dtype = dtype - case "max": + case ReductionOp.Max: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) init_val.dtype = dtype case _: diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index 44d1d1ede..f4046d87d 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -115,7 +115,7 @@ class AddOpenMP: if bool(ctx.local_reduction_symbols): for symbol, reduction in ctx.local_reduction_symbols.items(): if isinstance(symbol.dtype, PsScalarType): - pragma_text += f" reduction({reduction.op}: {symbol.name})" + pragma_text += f" reduction({reduction.op.value}: {symbol.name})" else: NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index d68bfbcac..6e0611a4b 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, 
GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable from .parameters import Parameter -from ..binop_mapping import binop_str_to_expr +from ..compound_op_mapping import compound_op_to_expr from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr from ..types import create_numeric_type, PsIntegerType, PsScalarType diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index 1e71c5b98..d3c2435ed 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,6 +2,7 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field +from ..sympyextensions.reduction import ReductionOp @dataclass(frozen=True) @@ -21,7 +22,7 @@ class LocalReductionVariable(PsSymbolProperty): from ..backend.memory import PsSymbol from ..backend.ast.expressions import PsExpression - op: str + op: ReductionOp init_val: PsExpression ptr_symbol: PsSymbol @@ -32,7 +33,7 @@ class ReductionPointerVariable(PsSymbolProperty): from ..backend.memory import PsSymbol - op: str + op: ReductionOp local_symbol: PsSymbol diff --git a/src/pystencils/binop_mapping.py b/src/pystencils/compound_op_mapping.py similarity index 65% rename from src/pystencils/binop_mapping.py rename to src/pystencils/compound_op_mapping.py index 060fa40aa..eb10b3381 100644 --- a/src/pystencils/binop_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -1,31 +1,33 @@ +from enum import Enum from operator import truediv, mul, sub, add from .backend.ast.expressions import PsExpression, PsCall from .backend.exceptions import FreezeError from .backend.functions import PsMathFunction, MathFunctions +from .sympyextensions.reduction import ReductionOp -_available_operator_interface: set[str] = {'+', '-', '*', '/'} +_available_operator_interface: set[ReductionOp] = {ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Div} -def binop_str_to_expr(op: str, op1, op2) -> PsExpression: +def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: if op in _available_operator_interface: match op: - case "+": + case ReductionOp.Add: operator = add - case "-": + case ReductionOp.Sub: operator = sub - case "*": + case ReductionOp.Mul: operator = mul - case "/": + case ReductionOp.Div: operator = truediv case _: raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") return operator(op1, op2) else: match op: - case "min": + case ReductionOp.Min: return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) - case "max": + case ReductionOp.Max: return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) case _: raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 6ab24e936..eb90f4bed 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,7 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc from .pointers import mem_acc -from .reduction import reduced_assign +from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp from .math import ( prod, @@ -34,7 +34,9 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", - "reduced_assign", + "reduction_assignment", + "reduction_assignment_from_str", + "ReductionOp", "TypedSymbol", "CastFunc", "mem_acc", diff --git 
a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index c9e5bfdfb..9d8aecb5b 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -1,54 +1,73 @@ +from enum import Enum + from sympy.codegen.ast import AssignmentBase -class ReducedAssignment(AssignmentBase): +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Div = "/" + Min = "min" + Max = "max" + + +class ReductionAssignment(AssignmentBase): """ Base class for reduced assignments. Attributes: =========== - binop : str - Symbol for binary operation being applied in the assignment, such as "+", - "*", etc. + binop : CompoundOp + Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. """ - binop = None # type: str + binop = None # type: ReductionOp @property def op(self): return self.binop -class AddReducedAssignment(ReducedAssignment): - binop = '+' +class AddReductionAssignment(ReductionAssignment): + binop = ReductionOp.Add -class SubReducedAssignment(ReducedAssignment): - binop = '-' +class SubReductionAssignment(ReductionAssignment): + binop = ReductionOp.Sub -class MulReducedAssignment(ReducedAssignment): - binop = '*' +class MulReductionAssignment(ReductionAssignment): + binop = ReductionOp.Mul -class MinReducedssignment(ReducedAssignment): - binop = 'min' +class MinReductionAssignment(ReductionAssignment): + binop = ReductionOp.Min -class MaxReducedssignment(ReducedAssignment): - binop = 'max' +class MaxReductionAssignment(ReductionAssignment): + binop = ReductionOp.Max -# Mapping from binary op strings to AugmentedAssignment subclasses -reduced_assign_classes = { +# Mapping from ReductionOp enum to ReductionAssigment classes +_reduction_assignment_classes = { cls.binop: cls for cls in [ - AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, - MinReducedssignment, MaxReducedssignment + AddReductionAssignment, SubReductionAssignment, MulReductionAssignment, + MinReductionAssignment, MaxReductionAssignment ] } +# Mapping from ReductionOp str to ReductionAssigment classes +_reduction_assignment_classes_for_str = { + cls.value: cls for cls in _reduction_assignment_classes +} -def reduced_assign(lhs, op, rhs): - if op not in reduced_assign_classes: + +def reduction_assignment(lhs, op: ReductionOp, rhs): + if op not in _reduction_assignment_classes: raise ValueError("Unrecognized operator %s" % op) - return reduced_assign_classes[op](lhs, rhs) + return _reduction_assignment_classes[op](lhs, rhs) + + +def reduction_assignment_from_str(lhs, op: str, rhs): + return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 8095f4e1d..c84417ac7 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -1,10 +1,9 @@ import pytest import numpy as np -import sympy as sp import cupy as cp import pystencils as ps -from pystencils.sympyextensions import reduced_assign +from pystencils.sympyextensions import reduction_assignment_from_str INIT_W = 5 INIT_ARR = 2 @@ -28,11 +27,11 @@ def test_reduction(dtype, op): # kernel with reduction assignment - reduction_assignment = reduced_assign(w, op, x.center()) + red_assign = reduction_assignment_from_str(w, op, x.center()) config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True) - ast_reduction = 
ps.create_kernel([reduction_assignment], config, default_dtype=dtype) + ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype) # code_reduction = ps.get_code_str(ast_reduction) kernel_reduction = ast_reduction.compile() -- GitLab From 2424c15725cd621fd8bfa573f928b82255a17693 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 28 Jan 2025 20:05:51 +0100 Subject: [PATCH 081/180] Move reduction properties for local and ptr variable into single dataclass --- .../backend/kernelcreation/context.py | 63 +++++++------------ .../backend/kernelcreation/freeze.py | 24 ++++--- .../backend/transformations/add_pragmas.py | 6 +- src/pystencils/codegen/driver.py | 24 ++++--- src/pystencils/codegen/parameters.py | 14 +---- src/pystencils/codegen/properties.py | 23 ------- src/pystencils/jit/cpu_extension_module.py | 16 ++--- tests/kernelcreation/test_reduction.py | 2 +- 8 files changed, 60 insertions(+), 112 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 2f46a7421..868a7852c 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -1,16 +1,17 @@ from __future__ import annotations +from dataclasses import dataclass from typing import Iterable, Iterator, Any from itertools import chain, count from collections import namedtuple, defaultdict import re +from ..ast.expressions import PsExpression from ...defaults import DEFAULTS from ...field import Field, FieldType +from ...sympyextensions import ReductionOp from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType -from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable - from ..memory import PsSymbol, PsBuffer from ..constants import PsConstant from ...types import ( @@ -46,6 +47,16 @@ class FieldsInKernel: FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) +@dataclass(frozen=True) +class ReductionInfo: + + op: ReductionOp + init_val: PsExpression + + orig_symbol: PsSymbol + ptr_symbol: PsSymbol + + class KernelCreationContext: """Manages the translation process from the SymPy frontend to the backend AST, and collects all necessary information for the translation: @@ -77,8 +88,7 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict() - self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict() + self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -173,41 +183,17 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable): - """Adds entry for a symbol and its property to the lookup table for local reduction variables. + def add_symbol_reduction_info(self, local_symb: PsSymbol, reduction_info: ReductionInfo): + """Adds entry for a symbol and its reduction info to its corresponding lookup table. - The symbol ``symbol`` should not have a 'LocalReductionSymbol' property and shall not exist in the symbol table. + The symbol ``symbol`` shall not exist in the symbol table already. 
""" - if self.find_symbol(local_symb.name) is not None: - raise PsInternalCompilerError( - f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table" - ) - self.add_symbol(local_symb) - - if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable): - local_symb.add_property(local_var_prop) - self._local_reduction_symbols[local_symb] = local_var_prop - else: + if local_symb in self._symbols_reduction_info: raise PsInternalCompilerError( - f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table" + f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table" ) - def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable): - """Replaces reduction symbol with a pointer-based counterpart used for export - and adds the new symbol and its property to the lookup table for pointer-based reduction variables - - The symbol ``ptr_symbol`` should not exist in the symbol table. - """ - self.replace_symbol(orig_symb, ptr_symb) - - if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties( - ReductionPointerVariable): - ptr_symb.add_property(ptr_var_prop) - self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop - else: - raise PsInternalCompilerError( - f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table " - ) + self._symbols_reduction_info[local_symb] = reduction_info def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None @@ -245,14 +231,9 @@ class KernelCreationContext: return self._symbols.values() @property - def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]: + def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]: """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" - return self._local_reduction_symbols - - @property - def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]: - """Return a dictionary holding pointer-based reduction symbols and their reduction properties.""" - return self._reduction_ptr_symbols + return self._symbols_reduction_info # Fields and Arrays diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 4bf136562..5bb7f8b08 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -19,7 +19,7 @@ from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...field import Field, FieldType -from .context import KernelCreationContext +from .context import KernelCreationContext, ReductionInfo from ..ast.structural import ( PsAstNode, @@ -66,8 +66,6 @@ from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError -from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable - ExprLike = ( sp.Expr @@ -210,25 +208,25 @@ class FreezeExpressions: init_val: PsExpression match expr.op: case ReductionOp.Add: - init_val = PsConstantExpr(PsConstant(0, dtype)) + init_val = PsConstantExpr(PsConstant(0)) case ReductionOp.Sub: - init_val = PsConstantExpr(PsConstant(0, dtype)) + init_val = PsConstantExpr(PsConstant(0)) case ReductionOp.Mul: - init_val = PsConstantExpr(PsConstant(1, dtype)) + init_val = 
PsConstantExpr(PsConstant(1)) case ReductionOp.Min: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) - init_val.dtype = dtype case ReductionOp.Max: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) - init_val.dtype = dtype case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - # set reduction symbol properties (local/pointer variables) in context - self._ctx.add_local_reduction_symbol(new_lhs_symb, - LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr)) - self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr, - ReductionPointerVariable(expr.op, new_lhs_symb)) + reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb, orig_lhs_symb_as_ptr) + + # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info + self._ctx.add_symbol(new_lhs_symb) + self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info) + self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr) + return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index f4046d87d..d72008d56 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -112,10 +112,10 @@ class AddOpenMP: pragma_text += " parallel" if not omp_params.omit_parallel_construct else "" pragma_text += f" for schedule({omp_params.schedule})" - if bool(ctx.local_reduction_symbols): - for symbol, reduction in ctx.local_reduction_symbols.items(): + if bool(ctx.symbols_reduction_info): + for symbol, reduction_info in ctx.symbols_reduction_info.items(): if isinstance(symbol.dtype, PsScalarType): - pragma_text += f" reduction({reduction.op.value}: {symbol.name})" + pragma_text += f" reduction({reduction_info.op.value}: {symbol.name})" else: NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 6e0611a4b..ba7df317a 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, replace from .target import Target from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange -from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable +from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter from ..compound_op_mapping import compound_op_to_expr from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr @@ -154,15 +154,21 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() - # Init local reduction variable copy - for local_red, local_prop in self._ctx.local_reduction_symbols.items(): - kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), local_prop.init_val)] + kernel_ast.statements + # Extensions for reductions + for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): + # Init local reduction variable copy + kernel_ast.statements = [PsDeclaration(PsSymbolExpr(symbol), + reduction_info.init_val)] + kernel_ast.statements - # Write back result to reduction target variable - for red_ptr, ptr_prop in self._ctx.reduction_pointer_symbols.items(): - ptr_access = 
PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + # Write back result to reduction target variable + ptr_access = PsMemAcc(PsSymbolExpr(reduction_info.ptr_symbol), + PsConstantExpr(PsConstant(0))) kernel_ast.statements += [PsAssignment( - ptr_access, binop_str_to_expr(ptr_prop.op, ptr_access, PsSymbolExpr(ptr_prop.local_symbol)))] + ptr_access, compound_op_to_expr(reduction_info.op, ptr_access, PsSymbolExpr(symbol)))] + + # TODO: only newly introduced nodes + typify = Typifier(self._ctx) + kernel_ast = typify(kernel_ast) # Target-Specific optimizations if self._cfg.target.is_cpu(): @@ -462,8 +468,6 @@ def _get_function_params( props: set[PsSymbolProperty] = set() for prop in symb.properties: match prop: - case ReductionPointerVariable(): - props.add(prop) case FieldShape() | FieldStride(): props.add(prop) case BufferBasePtr(buf): diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py index 094553517..e6a513cc7 100644 --- a/src/pystencils/codegen/parameters.py +++ b/src/pystencils/codegen/parameters.py @@ -8,7 +8,7 @@ from .properties import ( _FieldProperty, FieldShape, FieldStride, - FieldBasePtr, ReductionPointerVariable, + FieldBasePtr, ) from ..types import PsType from ..field import Field @@ -39,9 +39,6 @@ class Parameter: key=lambda f: f.name, ) ) - self._reduction_ptr: Optional[ReductionPointerVariable] = next( - (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None - ) @property def name(self): @@ -82,11 +79,6 @@ class Parameter: """Set of fields associated with this parameter.""" return self._fields - @property - def reduction_pointer(self) -> Optional[ReductionPointerVariable]: - """Reduction pointer associated with this parameter.""" - return self._reduction_ptr - def get_properties( self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...] 
) -> set[PsSymbolProperty]: @@ -113,10 +105,6 @@ class Parameter: ) return bool(self.get_properties(FieldBasePtr)) - @property - def is_reduction_pointer(self) -> bool: - return bool(self._reduction_ptr) - @property def is_field_stride(self) -> bool: # pragma: no cover warn( diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py index d3c2435ed..d377fb3d3 100644 --- a/src/pystencils/codegen/properties.py +++ b/src/pystencils/codegen/properties.py @@ -2,7 +2,6 @@ from __future__ import annotations from dataclasses import dataclass from ..field import Field -from ..sympyextensions.reduction import ReductionOp @dataclass(frozen=True) @@ -15,28 +14,6 @@ class UniqueSymbolProperty(PsSymbolProperty): """Base class for unique properties, of which only one instance may be registered at a time.""" -@dataclass(frozen=True) -class LocalReductionVariable(PsSymbolProperty): - """Property for symbols specifying the operation and initial value for a kernel-local reduction variable.""" - - from ..backend.memory import PsSymbol - from ..backend.ast.expressions import PsExpression - - op: ReductionOp - init_val: PsExpression - ptr_symbol: PsSymbol - - -@dataclass(frozen=True) -class ReductionPointerVariable(PsSymbolProperty): - """Property for pointer-type symbols exporting the reduction result from the kernel.""" - - from ..backend.memory import PsSymbol - - op: ReductionOp - local_symbol: PsSymbol - - @dataclass(frozen=True) class FieldShape(PsSymbolProperty): """Symbol acts as a shape parameter to a field.""" diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index 6ec62c28d..44185f4ed 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -18,7 +18,7 @@ from ..types import ( PsType, PsUnsignedIntegerType, PsSignedIntegerType, - PsIeeeFloatType, + PsIeeeFloatType, PsPointerType, ) from ..types.quick import Fp, SInt, UInt @@ -205,7 +205,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ self._array_assoc_var_extractions: dict[Parameter, str] = dict() self._scalar_extractions: dict[Parameter, str] = dict() - self._reduction_ptrs: dict[Parameter, str] = dict() + self._pointer_extractions: dict[Parameter, str] = dict() self._constraint_checks: list[str] = [] @@ -278,9 +278,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name - def extract_reduction_ptr(self, param: Parameter) -> str: - if param not in self._reduction_ptrs: - ptr = param.reduction_pointer + def extract_ptr(self, param: Parameter) -> str: + if param not in self._pointer_extractions: + ptr = param.symbol buffer = self.extract_buffer(ptr, param.name, param.dtype) code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" @@ -317,10 +317,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name def extract_parameter(self, param: Parameter): - if param.is_reduction_pointer: - self.extract_reduction_ptr(param) - elif param.is_field_parameter: + if param.is_field_parameter: self.extract_array_assoc_var(param) + elif isinstance(param.dtype, PsPointerType): + self.extract_ptr(param) else: self.extract_scalar(param) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c84417ac7..69b75e711 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -18,7 +18,7 @@ SOLUTION = { @pytest.mark.parametrize('dtype', ["float64"]) -@pytest.mark.parametrize("op", ["+", "-", "*"]) #, "min", "max"]) # TODO: 
min/max broken due to error in BasePrinter +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction(dtype, op): gpu_avail = False -- GitLab From 06dc234497d8860c4c7fde704ba26c8c3da030a1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 29 Jan 2025 11:48:26 +0100 Subject: [PATCH 082/180] Use std::numeric_limits as NumericLimitsFunctions backend for cpu --- .../backend/platforms/generic_cpu.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 58b9c7946..b145b6f76 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -44,7 +44,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<math.h>", "<limits.h>", "<float.h>"} + return {"<math.h>", "<limits>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -64,22 +64,7 @@ class GenericCpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - # get type prefix for macro - # TODO: there must be a better way... - tpe = "" - match dtype: - case PsIeeeFloatType(): - match dtype.width: - case 32: - tpe = "FLT" - case 64: - tpe = "DBL" - case _: - raise MaterializationError( - f"No implementation available for function {func} on data type {dtype}" - ) - - return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype)) + return PsLiteralExpr(PsLiteral(f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction -- GitLab From a9da7d432d3bb5ed03da5f08f7b5dbd94c17ff7d Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 29 Jan 2025 11:56:00 +0100 Subject: [PATCH 083/180] Fix lint [skip ci] --- src/pystencils/backend/kernelcreation/freeze.py | 1 - src/pystencils/codegen/parameters.py | 2 +- src/pystencils/compound_op_mapping.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 5bb7f8b08..c8d84c1b4 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -227,7 +227,6 @@ class FreezeExpressions: self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info) self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr) - return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py index e6a513cc7..d8411266e 100644 --- a/src/pystencils/codegen/parameters.py +++ b/src/pystencils/codegen/parameters.py @@ -1,7 +1,7 @@ from __future__ import annotations from warnings import warn -from typing import Sequence, Iterable, Optional +from typing import Sequence, Iterable from .properties import ( PsSymbolProperty, diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py index eb10b3381..1eadfa6f0 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -1,4 +1,3 @@ -from enum import Enum from operator import truediv, mul, sub, add from .backend.ast.expressions import PsExpression, PsCall -- GitLab From 
53807242b8f2eae36b75abb3d101d53b3948157a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 16:20:25 +0100 Subject: [PATCH 084/180] Remove orig_symbol from reduction info as it is not needed --- src/pystencils/backend/kernelcreation/context.py | 2 -- src/pystencils/backend/kernelcreation/freeze.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 868a7852c..67d5b1c1d 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -52,8 +52,6 @@ class ReductionInfo: op: ReductionOp init_val: PsExpression - - orig_symbol: PsSymbol ptr_symbol: PsSymbol diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index c8d84c1b4..cfa145e5a 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -220,7 +220,7 @@ class FreezeExpressions: case _: raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") - reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb, orig_lhs_symb_as_ptr) + reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb_as_ptr) # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info self._ctx.add_symbol(new_lhs_symb) -- GitLab From dd8f421d774f59d1dd36d4fdcb76a51375f4bda9 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 16:38:46 +0100 Subject: [PATCH 085/180] Introduce functions to be unfolded by platform into code blocks for reduction init and write-back --- src/pystencils/backend/functions.py | 43 +++++++++++++++++++ .../backend/kernelcreation/typification.py | 4 +- .../backend/platforms/generic_cpu.py | 41 ++++++++++++++++-- src/pystencils/backend/platforms/platform.py | 11 +++++ .../transformations/select_functions.py | 4 +- src/pystencils/codegen/driver.py | 29 +++++++------ tests/kernelcreation/test_reduction.py | 4 +- 7 files changed, 113 insertions(+), 23 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 18c2277cf..201321693 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -30,6 +30,7 @@ from typing import Any, Sequence, TYPE_CHECKING from abc import ABC from enum import Enum +from ..sympyextensions import ReductionOp from ..types import PsType from .exceptions import PsInternalCompilerError @@ -134,6 +135,48 @@ class PsMathFunction(PsFunction): return hash(self._func) +class ReductionFunctions(Enum): + """Function representing different steps in kernels with reductions supported by the backend. + + Each platform has to materialize these functions to a concrete implementation. 
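+
+    ``InitLocalCopy(local_var, init_val)`` declares the kernel-local reduction
+    variable and initialises it with the operation's neutral element, while
+    ``WriteBackToPtr(ptr, local_var)`` combines the local result into the memory
+    location referenced by ``ptr`` using the reduction operation.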
+ """ + + InitLocalCopy = ("InitLocalCopy", 2) + WriteBackToPtr = ("WriteBackToPtr", 2) + + def __init__(self, func_name, num_args): + self.function_name = func_name + self.num_args = num_args + + +class PsReductionFunction(PsFunction): + + def __init__(self, func: ReductionFunctions, op: ReductionOp) -> None: + super().__init__(func.function_name, func.num_args) + self._func = func + self._op = op + + @property + def func(self) -> ReductionFunctions: + return self._func + + @property + def op(self) -> ReductionOp: + return self._op + + def __str__(self) -> str: + return f"{self._func.function_name}" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, PsReductionFunction): + return False + + return self._func == other._func + + def __hash__(self) -> int: + return hash(self._func) + + class CFunction(PsFunction): """A concrete C function. diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 62feca265..059817bfd 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -50,7 +50,7 @@ from ..ast.expressions import ( PsNot, ) from ..ast.vector import PsVecBroadcast, PsVecMemAcc -from ..functions import PsMathFunction, CFunction +from ..functions import PsMathFunction, CFunction, PsReductionFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -590,7 +590,7 @@ class Typifier: case PsCall(function, args): match function: - case PsMathFunction(): + case PsMathFunction() | PsReductionFunction(): for arg in args: self.visit_expr(arg, tc) tc.infer_dtype(expr) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index b145b6f76..33cb28711 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -1,11 +1,14 @@ from abc import ABC, abstractmethod from typing import Sequence -from pystencils.backend.ast.expressions import PsCall +from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr -from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions +from ..ast import PsAstNode +from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, \ + PsReductionFunction from ..literals import PsLiteral -from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType +from ...compound_op_mapping import compound_op_to_expr +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform from ..exceptions import MaterializationError @@ -18,7 +21,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsAssignment from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -56,6 +59,36 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") + def unfold_function( + self, call: PsCall + ) -> PsAstNode: + assert isinstance(call.function, PsReductionFunction) + + func = call.function.func + + match func: + case ReductionFunctions.InitLocalCopy: + symbol_expr, init_val = call.args + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression) + + return PsDeclaration(symbol_expr, init_val) + case ReductionFunctions.WriteBackToPtr: 
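+                # Lowered to "*ptr = *ptr <op> local": the kernel-local result is
+                # combined into the reduction target behind the pointer argument.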
+ ptr_expr, symbol_expr = call.args + op = call.function.op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + + ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + + # TODO: can this be avoided somehow? + potential_call = compound_op_to_expr(op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + potential_call = self.select_function(potential_call) + + return PsAssignment(ptr_access, potential_call) + def select_function(self, call: PsCall) -> PsExpression: assert isinstance(call.function, PsMathFunction) diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 2c7ee1c5f..732f37bbc 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Any +from ..ast import PsAstNode from ..ast.structural import PsBlock from ..ast.expressions import PsCall, PsExpression @@ -40,3 +41,13 @@ class Platform(ABC): If no viable implementation exists, raise a `MaterializationError`. """ pass + + @abstractmethod + def unfold_function( + self, call: PsCall + ) -> PsAstNode: + """Unfolds an implementation for the given function on the given data type. + + If no viable implementation exists, raise a `MaterializationError`. + """ + pass diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index e41c345ae..0045de87b 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,7 +1,7 @@ from ..platforms import Platform from ..ast import PsAstNode from ..ast.expressions import PsCall -from ..functions import PsMathFunction +from ..functions import PsMathFunction, PsReductionFunction class SelectFunctions: @@ -19,5 +19,7 @@ class SelectFunctions: if isinstance(node, PsCall) and isinstance(node.function, PsMathFunction): return self._platform.select_function(node) + elif isinstance(node, PsCall) and isinstance(node.function, PsReductionFunction): + return self._platform.unfold_function(node) else: return node diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index ba7df317a..9a80439e7 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -7,14 +7,14 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO from .kernel import Kernel, GpuKernel, GpuThreadsRange from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr from .parameters import Parameter -from ..compound_op_mapping import compound_op_to_expr -from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr +from ..backend.functions import PsReductionFunction, ReductionFunctions +from ..backend.ast.expressions import PsSymbolExpr, PsCall from ..types import create_numeric_type, PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration +from ..backend.ast.structural import PsBlock, PsLoop from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -156,19 
+156,20 @@ class DefaultKernelCreationDriver: # Extensions for reductions for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): - # Init local reduction variable copy - kernel_ast.statements = [PsDeclaration(PsSymbolExpr(symbol), - reduction_info.init_val)] + kernel_ast.statements + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(symbol)) + ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) + init_val = typify(reduction_info.init_val) - # Write back result to reduction target variable - ptr_access = PsMemAcc(PsSymbolExpr(reduction_info.ptr_symbol), - PsConstantExpr(PsConstant(0))) - kernel_ast.statements += [PsAssignment( - ptr_access, compound_op_to_expr(reduction_info.op, ptr_access, PsSymbolExpr(symbol)))] + init_local_copy = PsCall(PsReductionFunction(ReductionFunctions.InitLocalCopy, reduction_info.op), + [symbol_expr, init_val]) + write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op), + [ptr_symbol_expr, symbol_expr]) - # TODO: only newly introduced nodes - typify = Typifier(self._ctx) - kernel_ast = typify(kernel_ast) + # Init local reduction variable copy + kernel_ast.statements = [init_local_copy] + kernel_ast.statements + # Write back result to reduction target variable + kernel_ast.statements += [write_back_ptr] # Target-Specific optimizations if self._cfg.target.is_cpu(): diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 69b75e711..b24058571 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -32,11 +32,11 @@ def test_reduction(dtype, op): config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True) ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype) + ps.show_code(ast_reduction) + # code_reduction = ps.get_code_str(ast_reduction) kernel_reduction = ast_reduction.compile() - ps.show_code(ast_reduction) - array = np.full((SIZE,), INIT_ARR, dtype=dtype) reduction_array = np.full((1,), INIT_W, dtype=dtype) -- GitLab From 6e08683b7e5fe681986a677a014cbcadafa017f2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 18:04:02 +0100 Subject: [PATCH 086/180] Add dummy implementations for unfold_function in cuda/sycl platforms --- src/pystencils/backend/platforms/cuda.py | 6 ++++++ src/pystencils/backend/platforms/sycl.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index f146cfbfd..bb42e1f9b 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -2,6 +2,7 @@ from __future__ import annotations from warnings import warn from typing import TYPE_CHECKING +from ..ast import PsAstNode from ...types import constify from ..exceptions import MaterializationError from .generic_gpu import GenericGpu @@ -134,6 +135,11 @@ class CudaPlatform(GenericGpu): f"No implementation available for function {func} on data type {dtype}" ) + def unfold_function( + self, call: PsCall + ) -> PsAstNode: + pass + # Internals def _prepend_dense_translation( diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 9c04d6074..dd38aeb48 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING +from ..ast import 
PsAstNode from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( IterationSpace, @@ -108,6 +109,11 @@ class SyclPlatform(GenericGpu): f"No implementation available for function {func} on data type {dtype}" ) + def unfold_function( + self, call: PsCall + ) -> PsAstNode: + pass + def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace ) -> tuple[PsBlock, GpuThreadsRange]: -- GitLab From e15d3cf7a043fc45b78747d85e035dcd314bcd42 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 19:15:13 +0100 Subject: [PATCH 087/180] Add first CUDA reduction impl using atomic operations --- src/pystencils/backend/platforms/cuda.py | 31 +++++++++++++++++++++--- src/pystencils/include/gpu_defines.h | 28 +++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index bb42e1f9b..95480de93 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -3,7 +3,7 @@ from warnings import warn from typing import TYPE_CHECKING from ..ast import PsAstNode -from ...types import constify +from ...types import constify, PsPointerType, PsScalarType, PsCustomType from ..exceptions import MaterializationError from .generic_gpu import GenericGpu @@ -23,12 +23,12 @@ from ..ast.expressions import ( PsCast, PsCall, PsLookup, - PsBufferAcc, + PsBufferAcc, PsSymbolExpr ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction +from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions if TYPE_CHECKING: from ...codegen import GpuIndexingConfig, GpuThreadsRange @@ -138,7 +138,30 @@ class CudaPlatform(GenericGpu): def unfold_function( self, call: PsCall ) -> PsAstNode: - pass + assert isinstance(call.function, PsReductionFunction) + + func = call.function.func + + match func: + case ReductionFunctions.InitLocalCopy: + symbol_expr, init_val = call.args + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression) + + return PsDeclaration(symbol_expr, init_val) + case ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call.function.op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + + call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) + call.args = [ptr_expr, symbol_expr] + + if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): + NotImplementedError("atomicMul is only available for float32/64 datatypes") + + return call # Internals diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h index 67e7722e9..04eeace47 100644 --- a/src/pystencils/include/gpu_defines.h +++ b/src/pystencils/include/gpu_defines.h @@ -10,3 +10,31 @@ typedef __hip_int8_t int8_t; typedef __hip_uint16_t uint16_t; typedef __hip_int16_t int16_t; #endif + +#ifdef __CUDA_ARCH__ +// Implementation of atomic multiplication +// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* 
address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} +#endif -- GitLab From 10def05e256d2c316f5063e48bbbfebe2c84ea85 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 19:59:46 +0100 Subject: [PATCH 088/180] Fix typecheck --- src/pystencils/backend/platforms/cuda.py | 2 +- src/pystencils/backend/platforms/sycl.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 95480de93..bf5b91b82 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -156,7 +156,7 @@ class CudaPlatform(GenericGpu): assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) - call.args = [ptr_expr, symbol_expr] + call.args = (ptr_expr, symbol_expr) if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): NotImplementedError("atomicMul is only available for float32/64 datatypes") diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index dd38aeb48..2ea2934f3 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -112,7 +112,9 @@ class SyclPlatform(GenericGpu): def unfold_function( self, call: PsCall ) -> PsAstNode: - pass + raise MaterializationError( + f"No implementation available for function {call.function.name}" + ) def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace -- GitLab From 0fb11858f2c65fc46c8dca469c75c28bf283dfdb Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 30 Jan 2025 20:06:00 +0100 Subject: [PATCH 089/180] Add CUDA backend for numeric limits --- src/pystencils/backend/platforms/cuda.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index bf5b91b82..ef3c11598 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -28,7 +28,8 @@ from ..ast.expressions import ( from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions +from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions, \ + NumericLimitsFunctions if TYPE_CHECKING: from ...codegen import GpuIndexingConfig, GpuThreadsRange @@ -64,7 +65,7 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return {'"gpu_defines.h"'} + return {'"gpu_defines.h"', "<cuda/std/limits>"} def materialize_iteration_space( self, body: PsBlock, ispace: 
IterationSpace @@ -83,6 +84,9 @@ class CudaPlatform(GenericGpu): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args + if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + return PsLiteralExpr(PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) + if isinstance(dtype, PsIeeeFloatType): match func: case ( -- GitLab From 616f609f24551439403c13b0d282fab147099f9f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 31 Jan 2025 14:51:45 +0100 Subject: [PATCH 090/180] Fix lint [skip ci] --- src/pystencils/backend/platforms/cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index ef3c11598..3fe5be229 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -85,7 +85,8 @@ class CudaPlatform(GenericGpu): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - return PsLiteralExpr(PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) + return PsLiteralExpr( + PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) if isinstance(dtype, PsIeeeFloatType): match func: -- GitLab From 4c7fd40921a2486f8164efc84e455b59138ae6d1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 31 Jan 2025 14:56:08 +0100 Subject: [PATCH 091/180] Fix lint [skip ci] --- src/pystencils/backend/platforms/cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 3fe5be229..73c4b3b47 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -160,7 +160,8 @@ class CudaPlatform(GenericGpu): assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) - call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) + call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], + PsCustomType("void")) call.args = (ptr_expr, symbol_expr) if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): -- GitLab From 826ee8e26f2bb09b95bb473b347d06cd6a36207c Mon Sep 17 00:00:00 2001 From: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue, 4 Feb 2025 16:30:13 +0100 Subject: [PATCH 092/180] Try supporting pointer dtypes for reductions in cupy gpu jit --- src/pystencils/jit/gpu_cupy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index c208ac219..467e86be7 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,7 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType +from ..types import PsType, PsPointerType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -183,6 +183,9 @@ class CupyKernelWrapper(KernelWrapper): kparam.dtype, ) break + elif isinstance(kparam.dtype, PsPointerType): + val = kwargs[kparam.name] + args.append(val) else: # scalar parameter val: Any = kwargs[kparam.name] 
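With the pointer-parameter handling above in place, a generated reduction kernel can be driven end to end through the cupy JIT, with the reduction target passed as a one-element device array. The sketch below is illustrative only: it mirrors tests/kernelcreation/test_reduction.py from this series, but the import path of reduction_assignment_from_str and the chosen initial values are assumptions rather than taken from the patches themselves.

import cupy as cp
import numpy as np
import pystencils as ps
from pystencils.sympyextensions import reduction_assignment_from_str  # import path assumed

dtype = "float64"
x = ps.fields(f"x: {dtype}[1d]")
w = ps.TypedSymbol("w", dtype)
red_assign = reduction_assignment_from_str(w, "+", x.center())  # reduce field values into w

config = ps.CreateKernelConfig(target=ps.Target.GPU)
kernel = ps.create_kernel([red_assign], config, default_dtype=dtype).compile()

SIZE, INIT_ARR, INIT_W = 15, 2, 5.0  # INIT_W is an illustrative value
x_arr = cp.full((SIZE,), INIT_ARR, dtype=np.float64)  # field data on the device
w_arr = cp.full((1,), INIT_W, dtype=np.float64)       # one-element reduction target

kernel(x=x_arr, w=w_arr)  # the cupy wrapper forwards w_arr's device pointer
assert np.isclose(float(w_arr.get()[0]), INIT_W + INIT_ARR * SIZE)  # "+" reduction result
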
-- GitLab From f60d9d5df3c87c54d587ad7496025e70ee2388f0 Mon Sep 17 00:00:00 2001 From: Richard Angersbach <iwia025h@csnhr.nhr.fau.de> Date: Tue, 4 Feb 2025 16:31:16 +0100 Subject: [PATCH 093/180] Minor adaptations for reduction test --- tests/kernelcreation/test_reduction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index b24058571..be2589912 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -18,9 +18,9 @@ SOLUTION = { @pytest.mark.parametrize('dtype', ["float64"]) -@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) -def test_reduction(dtype, op): - gpu_avail = False +@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max" +def test_reduction(target, dtype, op): + gpu_avail = target is ps.Target.GPU x = ps.fields(f'x: {dtype}[1d]') w = ps.TypedSymbol("w", dtype) -- GitLab From d8ae900242264392479f4405678d9a1f1b177890 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 4 Feb 2025 17:49:37 +0100 Subject: [PATCH 094/180] Use predefined macro values for numeric limits in cuda backend --- src/pystencils/backend/platforms/cuda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 73c4b3b47..fa246c128 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -65,7 +65,7 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return {'"gpu_defines.h"', "<cuda/std/limits>"} + return {'"gpu_defines.h"'} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -85,8 +85,11 @@ class CudaPlatform(GenericGpu): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + assert isinstance(dtype, PsIeeeFloatType) + defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" } + return PsLiteralExpr( - PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) + PsLiteral(defines[func.function_name], dtype)) if isinstance(dtype, PsIeeeFloatType): match func: -- GitLab From a2a59d40b66390cebe849870d1b9cf058da82850 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 4 Feb 2025 18:04:15 +0100 Subject: [PATCH 095/180] Wrap statement around generated atomic call [skip ci] --- src/pystencils/backend/platforms/cuda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index fa246c128..a89225a08 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -16,7 +16,7 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -170,7 +170,7 @@ class CudaPlatform(GenericGpu): if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): NotImplementedError("atomicMul is only available for float32/64 datatypes") - return call + return PsStatement(call) # Internals -- GitLab From 
5caafdd05712621bc990ce7ef13c08f232f260af Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 10 Feb 2025 15:13:51 +0100 Subject: [PATCH 096/180] Add guard for INFINITY numeric limit macro used by cuda backend --- src/pystencils/backend/platforms/cuda.py | 3 +-- src/pystencils/include/gpu_defines.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 1a8fdc482..1af8917cc 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -90,8 +90,7 @@ class CudaPlatform(GenericGpu): assert isinstance(dtype, PsIeeeFloatType) defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" } - return PsLiteralExpr( - PsLiteral(defines[func.function_name], dtype)) + return PsLiteralExpr(PsLiteral(defines[func], dtype)) if isinstance(dtype, PsIeeeFloatType): match func: diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h index 04eeace47..8f961e25b 100644 --- a/src/pystencils/include/gpu_defines.h +++ b/src/pystencils/include/gpu_defines.h @@ -1,8 +1,10 @@ #pragma once #define POS_INFINITY __int_as_float(0x7f800000) -#define INFINITY POS_INFINITY #define NEG_INFINITY __int_as_float(0xff800000) +#ifndef INFINITY +#define INFINITY POS_INFINITY +#endif #ifdef __HIPCC_RTC__ typedef __hip_uint8_t uint8_t; -- GitLab From a71e0d318a6f96bef38ce4b7b260d8f3fd73d91d Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 10 Feb 2025 18:03:52 +0100 Subject: [PATCH 097/180] Temporarily change default CUDA block size for CUDA jit --- src/pystencils/jit/gpu_cupy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 54fb41173..331b58ce5 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -242,7 +242,7 @@ class CupyKernelWrapper(KernelWrapper): class CupyJit(JitBase): - def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)): + def __init__(self, default_block_size: Sequence[int] = (128, 1, 1)): self._runtime_headers = {"<cstdint>"} if len(default_block_size) > 3: -- GitLab From 60a348f1f3f39d184da872007756c3aeed13ccee Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 10 Feb 2025 18:30:29 +0100 Subject: [PATCH 098/180] Support atomic sub, min, max for fp reductions using custom implementations with CAS mechanism --- src/pystencils/backend/platforms/cuda.py | 12 ++++- src/pystencils/include/gpu_defines.h | 59 +++++++++++++++++++++++- tests/kernelcreation/test_reduction.py | 2 +- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 1af8917cc..f9fbdfa56 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -3,6 +3,7 @@ from warnings import warn from typing import TYPE_CHECKING from ..ast import PsAstNode +from ...sympyextensions.reduction import ReductionOp from ...types import constify, PsPointerType, PsScalarType, PsCustomType from ..exceptions import MaterializationError from .generic_gpu import GenericGpu @@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu): assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) - 
call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], + match op: + case ReductionOp.Sub: + # workaround for unsupported atomicSub: use atomic add and invert sign + call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) - call.args = (ptr_expr, symbol_expr) + call.args = (ptr_expr, -symbol_expr) + case _: + call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], + PsCustomType("void")) + call.args = (ptr_expr, symbol_expr) if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): NotImplementedError("atomicMul is only available for float32/64 datatypes") diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h index 8f961e25b..5525bbc69 100644 --- a/src/pystencils/include/gpu_defines.h +++ b/src/pystencils/include/gpu_defines.h @@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t; #endif #ifdef __CUDA_ARCH__ -// Implementation of atomic multiplication -// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +// No direct implementation of atomic multiplication, minimum and maximum available +// -> add support by custom implementations using a CAS mechanism + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division __device__ double atomicMul(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int oldValue = *address_as_ull, assumed; @@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) { return __int_as_float(old); } + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} #endif diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index be2589912..07fb94a7e 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -18,7 +18,7 @@ SOLUTION = { 
@pytest.mark.parametrize('dtype', ["float64"]) -@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max" +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction(target, dtype, op): gpu_avail = target is ps.Target.GPU -- GitLab From d4b7e78fca17ca130c55959e330a161d07ebba80 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 11 Feb 2025 17:16:23 +0100 Subject: [PATCH 099/180] Add initial implementation for horizontal reductions for vectorization --- src/pystencils/backend/ast/vector.py | 40 +++++++++++++++++++ src/pystencils/backend/emission/ir_printer.py | 11 ++++- .../backend/kernelcreation/typification.py | 18 ++++++++- src/pystencils/backend/platforms/x86.py | 16 ++++++-- .../transformations/loop_vectorizer.py | 31 +++++++++++--- .../transformations/select_intrinsics.py | 14 ++++++- tests/kernelcreation/test_reduction.py | 5 ++- 7 files changed, 122 insertions(+), 13 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 705d25094..8ff1ff8a0 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -5,6 +5,7 @@ from typing import cast from .astnode import PsAstNode from .expressions import PsExpression, PsLvalue, PsUnOp from .util import failing_cast +from ...sympyextensions import ReductionOp from ...types import PsVectorType @@ -42,6 +43,45 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): ) +class PsVecHorizontal(PsUnOp, PsVectorOp): + """Extracts scalar value from N vector lanes.""" + + __match_args__ = ("lanes", "operand", "operation") + + def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp): + super().__init__(operand) + self._lanes = lanes + self._reduction_operation = reduction_op + + @property + def lanes(self) -> int: + return self._lanes + + @lanes.setter + def lanes(self, n: int): + self._lanes = n + + @property + def reduction_operation(self) -> ReductionOp: + return self._reduction_operation + + @reduction_operation.setter + def reduction_operation(self, op: ReductionOp): + self._reduction_operation = op + + def _clone_expr(self) -> PsVecHorizontal: + return PsVecHorizontal(self._lanes, self._operand.clone(), self._operation.clone()) + + def structurally_equal(self, other: PsAstNode) -> bool: + if not isinstance(other, PsVecHorizontal): + return False + return ( + super().structurally_equal(other) + and self._lanes == other._lanes + and self._operation == other._operation + ) + + class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): """Pointer-based vectorized memory access. 
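For readers unfamiliar with horizontal operations: PsVecHorizontal models folding the lanes of a SIMD register back into a single scalar once the vectorized loop has finished, so that the result can be combined with the local reduction copy. A small numpy illustration of the intended semantics follows; this is plain Python for exposition, not pystencils API.

import numpy as np

acc = np.array([1.0, 2.0, 3.0, 4.0])        # the 4 lanes of an AVX double accumulator
horizontal_add = float(np.add.reduce(acc))      # 10.0, for a "+" reduction
horizontal_max = float(np.maximum.reduce(acc))  # 4.0, for a "max" reduction

Note that at this point in the series the node only takes the vector operand; PATCH 102 below reworks it into a PsBinOp that also carries the scalar operand it is combined with.
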
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index ffb65181c..04084dd3b 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR from ..ast import PsAstNode from ..ast.expressions import PsBufferAcc -from ..ast.vector import PsVecMemAcc, PsVecBroadcast +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal if TYPE_CHECKING: from ...codegen import Kernel @@ -77,6 +77,15 @@ class IRAstPrinter(BasePrinter): f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest ) + case PsVecHorizontal(lanes, operand, reduction_op): + pc.push_op(Ops.Weakest, LR.Middle) + operand_code = self.visit(operand, pc) + pc.pop_op() + + return pc.parenthesize( + f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({operand_code})", Ops.Weakest + ) + case _: return super().visit(node, pc) diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 059817bfd..25fb55a0b 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -49,7 +49,7 @@ from ..ast.expressions import ( PsNeg, PsNot, ) -from ..ast.vector import PsVecBroadcast, PsVecMemAcc +from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal from ..functions import PsMathFunction, CFunction, PsReductionFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -640,6 +640,22 @@ class Typifier: tc.apply_dtype(PsVectorType(op_tc.target_type, lanes), expr) + case PsVecHorizontal(): + op_tc = TypeContext() + self.visit_expr(expr.operand, op_tc) + + if op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr.operand}" + ) + + if not isinstance(op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in argument to vector horizontal: {op_tc.target_type}" + ) + + tc.apply_dtype(op_tc.target_type.scalar_type, expr) + case _: raise NotImplementedError(f"Can't typify {expr}") diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 7d2fe650f..acd397155 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -17,8 +17,8 @@ from ..ast.expressions import ( PsCast, PsCall, ) -from ..ast.vector import PsVecMemAcc, PsVecBroadcast -from ...types import PsCustomType, PsVectorType, PsPointerType +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant from ..exceptions import MaterializationError @@ -160,7 +160,14 @@ class X86VectorCpu(GenericVectorCpu): ) -> PsExpression: match expr: case PsUnOp() | PsBinOp(): - func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) + vtype: PsType + if isinstance(expr, PsVecHorizontal): + # expression itself is scalar, but argument is a vector + vtype = expr.operand.get_dtype() + else: + vtype = expr.get_dtype() + + func = _x86_op_intrin(self._vector_arch, expr, vtype) intrinsic = func(*operands) intrinsic.dtype = func.return_type return intrinsic @@ -343,6 +350,9 @@ def _x86_op_intrin( if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4: suffix += "x" atype = vtype.scalar_type + case PsVecHorizontal(): + opstr = 
f"horizontal_{op.reduction_operation.name.lower()}" + rtype = vtype.scalar_type case PsAdd(): opstr = "add" case PsSub(): diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e1e4fea50..39d72adb4 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,9 +7,9 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression, PsTernary, PsGt -from ..ast.vector import PsVecBroadcast +from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment +from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr +from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer @@ -134,6 +134,21 @@ class LoopVectorizer: # Prepare vectorization context vc = VectorizationContext(self._ctx, self._lanes, axis) + # Prepare reductions + simd_init_local_reduction_vars = [] + simd_writeback_local_reduction_vars = [] + for symb, reduction_info in self._ctx.symbols_reduction_info.items(): + # Vectorize symbol for local copy + vector_symb = vc.vectorize_symbol(symb) + + # Declare and init vector + simd_init_local_reduction_vars += [self._type_fold(PsDeclaration( + PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb))))] + + # Write back vectorization result + simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment( + PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(vector_symb), reduction_info.op)))] + # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -224,10 +239,14 @@ class LoopVectorizer: ) return PsBlock( + simd_init_local_reduction_vars + [ simd_stop_decl, simd_step_decl, - simd_loop, + simd_loop + ] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -238,11 +257,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( + simd_init_local_reduction_vars + [ simd_stop_decl, simd_step_decl, simd_loop, - ] + ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 060192810..7a03e293a 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp -from ..ast.vector import PsVecMemAcc +from ..ast.vector import PsVecMemAcc, PsVecHorizontal from ..exceptions import MaterializationError from ..functions import CFunction, PsMathFunction @@ -86,6 +86,10 @@ class SelectIntrinsics: new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) + case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal): + new_rhs = self.visit_expr(rhs, sc) + return PsAssignment(lhs, new_rhs) + case _: node.children 
= [self.visit(c, sc) for c in node.children] @@ -93,7 +97,13 @@ class SelectIntrinsics: def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression: if not isinstance(expr.dtype, PsVectorType): - return expr + # special case: result type of horizontal reduction is scalar + if isinstance(expr, PsVecHorizontal): + op = self.visit_expr(expr.operand, sc) + print(op) + return self._platform.op_intrinsic(expr, [op]) + else: + return expr match expr: case PsSymbolExpr(symb): diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index be2589912..f64ba154a 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -29,7 +29,10 @@ def test_reduction(target, dtype, op): red_assign = reduction_assignment_from_str(w, op, x.center()) - config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True) + vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True} + + config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail \ + else ps.CreateKernelConfig(cpu_openmp=True, cpu_vectorize_info=vectorize_info) ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype) ps.show_code(ast_reduction) -- GitLab From b4b105be0ff674bd1cacaea75e5f92bf8a7fda3c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 11 Feb 2025 17:21:54 +0100 Subject: [PATCH 100/180] Minor fix --- tests/kernelcreation/test_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 67a844821..12dc4ba1c 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -31,8 +31,8 @@ def test_reduction(target, dtype, op): vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True} - config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail \ - else ps.CreateKernelConfig(cpu_openmp=True, cpu_vectorize_info=vectorize_info) + config = ps.CreateKernelConfig(target=target) if gpu_avail \ + else ps.CreateKernelConfig(target=target, cpu_openmp=True, cpu_vectorize_info=vectorize_info) ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype) ps.show_code(ast_reduction) -- GitLab From eb7823a5b28589a45f968ac6415ee2a02543a3ab Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Feb 2025 14:13:05 +0100 Subject: [PATCH 101/180] Minor refactor of reduction ops --- src/pystencils/backend/ast/vector.py | 16 ++++++++-------- src/pystencils/backend/functions.py | 10 +++++----- .../backend/kernelcreation/freeze.py | 9 +++++---- .../backend/platforms/generic_cpu.py | 2 +- src/pystencils/backend/platforms/x86.py | 2 +- src/pystencils/compound_op_mapping.py | 10 +++++----- src/pystencils/sympyextensions/reduction.py | 18 +++++++++--------- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 8ff1ff8a0..4e6b2ff00 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -51,7 +51,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp): def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp): super().__init__(operand) self._lanes = lanes - self._reduction_operation = reduction_op + self._reduction_op = reduction_op @property def lanes(self) -> int: @@ -62,15 +62,15 @@ class PsVecHorizontal(PsUnOp, 
PsVectorOp): self._lanes = n @property - def reduction_operation(self) -> ReductionOp: - return self._reduction_operation + def reduction_op(self) -> ReductionOp: + return self._reduction_op - @reduction_operation.setter - def reduction_operation(self, op: ReductionOp): - self._reduction_operation = op + @reduction_op.setter + def reduction_op(self, op: ReductionOp): + self._reduction_op = op def _clone_expr(self) -> PsVecHorizontal: - return PsVecHorizontal(self._lanes, self._operand.clone(), self._operation.clone()) + return PsVecHorizontal(self._lanes, self._operand.clone(), self._reduction_op) def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecHorizontal): @@ -78,7 +78,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp): return ( super().structurally_equal(other) and self._lanes == other._lanes - and self._operation == other._operation + and self._reduction_op == other._reduction_op ) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index e1f742386..d28ef5f44 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -152,18 +152,18 @@ class ReductionFunctions(Enum): class PsReductionFunction(PsFunction): - def __init__(self, func: ReductionFunctions, op: ReductionOp) -> None: + def __init__(self, func: ReductionFunctions, reduction_op: ReductionOp) -> None: super().__init__(func.function_name, func.num_args) self._func = func - self._op = op + self._reduction_op = reduction_op @property def func(self) -> ReductionFunctions: return self._func @property - def op(self) -> ReductionOp: - return self._op + def reduction_op(self) -> ReductionOp: + return self._reduction_op def __str__(self) -> str: return f"{self._func.function_name}" @@ -172,7 +172,7 @@ class PsReductionFunction(PsFunction): if not isinstance(other, PsReductionFunction): return False - return self._func == other._func + return self._func == other._func and self._reduction_op == other._reduction_op def __hash__(self) -> int: return hash(self._func) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 675655802..ce65cd85d 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -189,6 +189,7 @@ class FreezeExpressions: assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) + op = expr.reduction_op orig_lhs_symb = lhs.symbol dtype = lhs.dtype @@ -202,11 +203,11 @@ class FreezeExpressions: new_lhs = PsSymbolExpr(new_lhs_symb) # get new rhs from augmented assignment - new_rhs: PsExpression = compound_op_to_expr(expr.op, new_lhs.clone(), rhs) + new_rhs: PsExpression = compound_op_to_expr(op, new_lhs.clone(), rhs) # match for reduction operation and set neutral init_val init_val: PsExpression - match expr.op: + match op: case ReductionOp.Add: init_val = PsConstantExpr(PsConstant(0)) case ReductionOp.Sub: @@ -218,9 +219,9 @@ class FreezeExpressions: case ReductionOp.Max: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case _: - raise FreezeError(f"Unsupported reduced assignment: {expr.op}.") + raise FreezeError(f"Unsupported reduced assignment: {op}.") - reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb_as_ptr) + reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr) # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info self._ctx.add_symbol(new_lhs_symb) diff --git 
a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index aa6e22b85..7655572de 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -74,7 +74,7 @@ class GenericCpu(Platform): return PsDeclaration(symbol_expr, init_val) case ReductionFunctions.WriteBackToPtr: ptr_expr, symbol_expr = call.args - op = call.function.op + op = call.function.reduction_op assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 0727b65b9..59c3a178f 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -354,7 +354,7 @@ def _x86_op_intrin( suffix += "x" atype = vtype.scalar_type case PsVecHorizontal(): - opstr = f"horizontal_{op.reduction_operation.name.lower()}" + opstr = f"horizontal_{op.reduction_op.name.lower()}" rtype = vtype.scalar_type case PsAdd(): opstr = "add" diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py index 1eadfa6f0..2dd88fc94 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -1,6 +1,6 @@ from operator import truediv, mul, sub, add -from .backend.ast.expressions import PsExpression, PsCall +from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv from .backend.exceptions import FreezeError from .backend.functions import PsMathFunction, MathFunctions from .sympyextensions.reduction import ReductionOp @@ -12,13 +12,13 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: if op in _available_operator_interface: match op: case ReductionOp.Add: - operator = add + operator = PsAdd case ReductionOp.Sub: - operator = sub + operator = PsSub case ReductionOp.Mul: - operator = mul + operator = PsMul case ReductionOp.Div: - operator = truediv + operator = PsDiv case _: raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") return operator(op1, op2) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 9d8aecb5b..25ae5c0ac 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -22,36 +22,36 @@ class ReductionAssignment(AssignmentBase): binop : CompoundOp Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. 
""" - binop = None # type: ReductionOp + reduction_op = None # type: ReductionOp @property - def op(self): - return self.binop + def reduction_op(self): + return self.reduction_op class AddReductionAssignment(ReductionAssignment): - binop = ReductionOp.Add + reduction_op = ReductionOp.Add class SubReductionAssignment(ReductionAssignment): - binop = ReductionOp.Sub + reduction_op = ReductionOp.Sub class MulReductionAssignment(ReductionAssignment): - binop = ReductionOp.Mul + reduction_op = ReductionOp.Mul class MinReductionAssignment(ReductionAssignment): - binop = ReductionOp.Min + reduction_op = ReductionOp.Min class MaxReductionAssignment(ReductionAssignment): - binop = ReductionOp.Max + reduction_op = ReductionOp.Max # Mapping from ReductionOp enum to ReductionAssigment classes _reduction_assignment_classes = { - cls.binop: cls for cls in [ + cls.reduction_op: cls for cls in [ AddReductionAssignment, SubReductionAssignment, MulReductionAssignment, MinReductionAssignment, MaxReductionAssignment ] -- GitLab From 8e0a74784ccb0d22c89364871ee630ba67239b2e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Feb 2025 15:22:45 +0100 Subject: [PATCH 102/180] Refactor PsVecHorizontal as PsBinOp --- src/pystencils/backend/ast/vector.py | 29 ++++++++++--- src/pystencils/backend/emission/ir_printer.py | 8 ++-- .../backend/kernelcreation/typification.py | 43 ++++++++++++------- src/pystencils/backend/platforms/x86.py | 15 +++++-- .../transformations/loop_vectorizer.py | 3 +- .../transformations/select_intrinsics.py | 6 +-- .../include/simd_horizontal_helpers.h | 11 +++++ 7 files changed, 82 insertions(+), 33 deletions(-) create mode 100644 src/pystencils/include/simd_horizontal_helpers.h diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 4e6b2ff00..14249e1e8 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import cast from .astnode import PsAstNode -from .expressions import PsExpression, PsLvalue, PsUnOp +from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp from .util import failing_cast from ...sympyextensions import ReductionOp @@ -43,13 +43,14 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): ) -class PsVecHorizontal(PsUnOp, PsVectorOp): +class PsVecHorizontal(PsBinOp, PsVectorOp): """Extracts scalar value from N vector lanes.""" - __match_args__ = ("lanes", "operand", "operation") + __match_args__ = ("lanes", "scalar_operand", "vector_operand", "operation") - def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp): - super().__init__(operand) + def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression, + reduction_op: ReductionOp): + super().__init__(scalar_operand, vector_operand) self._lanes = lanes self._reduction_op = reduction_op @@ -61,6 +62,22 @@ class PsVecHorizontal(PsUnOp, PsVectorOp): def lanes(self, n: int): self._lanes = n + @property + def scalar_operand(self) -> PsExpression: + return self._op1 + + @scalar_operand.setter + def scalar_operand(self, op: PsExpression): + self._op1 = op + + @property + def vector_operand(self) -> PsExpression: + return self._op2 + + @vector_operand.setter + def vector_operand(self, op: PsExpression): + self._op2 = op + @property def reduction_op(self) -> ReductionOp: return self._reduction_op @@ -70,7 +87,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp): self._reduction_op = op def _clone_expr(self) -> 
PsVecHorizontal: - return PsVecHorizontal(self._lanes, self._operand.clone(), self._reduction_op) + return PsVecHorizontal(self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op) def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecHorizontal): diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index 04084dd3b..1508e6d94 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -77,13 +77,15 @@ class IRAstPrinter(BasePrinter): f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest ) - case PsVecHorizontal(lanes, operand, reduction_op): + case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op): pc.push_op(Ops.Weakest, LR.Middle) - operand_code = self.visit(operand, pc) + scalar_operand_code = self.visit(scalar_operand, pc) + vector_operand_code = self.visit(vector_operand, pc) pc.pop_op() return pc.parenthesize( - f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({operand_code})", Ops.Weakest + f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})", + Ops.Weakest ) case _: diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 25fb55a0b..544746ef6 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -579,6 +579,33 @@ class Typifier: else: tc.apply_dtype(PsBoolType(), expr) + case PsVecHorizontal(): + # bin op consisting of a scalar and a vector that is converted to a scalar + # -> whole expression should be treated as scalar + + scalar_op_tc = TypeContext() + self.visit_expr(expr.scalar_operand, scalar_op_tc) + + vector_op_tc = TypeContext() + self.visit_expr(expr.vector_operand, vector_op_tc) + + if scalar_op_tc.target_type is None or vector_op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr}" + ) + + if not isinstance(scalar_op_tc.target_type, PsScalarType): + raise TypificationError( + f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}" + ) + + if not isinstance(vector_op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + ) + + tc.apply_dtype(scalar_op_tc.target_type, expr) + case PsBinOp(op1, op2): self.visit_expr(op1, tc) self.visit_expr(op2, tc) @@ -640,22 +667,6 @@ class Typifier: tc.apply_dtype(PsVectorType(op_tc.target_type, lanes), expr) - case PsVecHorizontal(): - op_tc = TypeContext() - self.visit_expr(expr.operand, op_tc) - - if op_tc.target_type is None: - raise TypificationError( - f"Unable to determine type of argument to vector horizontal: {expr.operand}" - ) - - if not isinstance(op_tc.target_type, PsVectorType): - raise TypificationError( - f"Illegal type in argument to vector horizontal: {op_tc.target_type}" - ) - - tc.apply_dtype(op_tc.target_type.scalar_type, expr) - case _: raise NotImplementedError(f"Can't typify {expr}") diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 59c3a178f..ee14d1689 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Sequence +from typing import Sequence, Tuple from enum import Enum from functools 
import cache @@ -132,6 +132,8 @@ class X86VectorCpu(GenericVectorCpu): else: headers = {"<immintrin.h>"} + headers.update({'"simd_horizontal_helpers.h"'}) + return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: @@ -162,8 +164,8 @@ class X86VectorCpu(GenericVectorCpu): case PsUnOp() | PsBinOp(): vtype: PsType if isinstance(expr, PsVecHorizontal): - # expression itself is scalar, but argument is a vector - vtype = expr.operand.get_dtype() + # return type of expression itself is scalar, but input argument to intrinsic is a vector + vtype = expr.vector_operand.get_dtype() else: vtype = expr.get_dtype() @@ -346,6 +348,7 @@ def _x86_op_intrin( prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) rtype = atype = varch.intrin_type(vtype) + atypes: Tuple[PsType, ...] = () match op: case PsVecBroadcast(): @@ -356,6 +359,7 @@ def _x86_op_intrin( case PsVecHorizontal(): opstr = f"horizontal_{op.reduction_op.name.lower()}" rtype = vtype.scalar_type + atypes = (vtype.scalar_type, vtype) case PsAdd(): opstr = "add" case PsSub(): @@ -418,4 +422,7 @@ def _x86_op_intrin( ) num_args = 1 if isinstance(op, PsUnOp) else 2 - return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype) + if not atypes: + atypes = (atype,) * num_args + + return CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype) diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index 39d72adb4..ab28507c2 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -147,7 +147,8 @@ class LoopVectorizer: # Write back vectorization result simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment( - PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(vector_symb), reduction_info.op)))] + PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb), + reduction_info.op)))] # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 7a03e293a..49fb9bb08 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -99,9 +99,9 @@ class SelectIntrinsics: if not isinstance(expr.dtype, PsVectorType): # special case: result type of horizontal reduction is scalar if isinstance(expr, PsVecHorizontal): - op = self.visit_expr(expr.operand, sc) - print(op) - return self._platform.op_intrinsic(expr, [op]) + scalar_op = expr.scalar_operand + vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) + return self._platform.op_intrinsic(expr, [scalar_op, vector_op_to_scalar]) else: return expr diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h new file mode 100644 index 000000000..6a80f2107 --- /dev/null +++ b/src/pystencils/include/simd_horizontal_helpers.h @@ -0,0 +1,11 @@ +#pragma once + +#include <immintrin.h> + +#define QUALIFIERS inline + +QUALIFIERS double _mm256_horizontal_add_pd(double a, __m256d b) { + __m256d _v = b; + __m256d _h = _mm256_hadd_pd(_v,_v); + return a + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} \ No newline at end of file -- GitLab From 7306f4ddfbb9066494480f14cdb211ae657f79ba Mon Sep 17 00:00:00 
2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Feb 2025 15:41:34 +0100 Subject: [PATCH 103/180] Minor fix --- src/pystencils/backend/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index f9fbdfa56..90efebe61 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -161,7 +161,7 @@ class CudaPlatform(GenericGpu): return PsDeclaration(symbol_expr, init_val) case ReductionFunctions.WriteBackToPtr: ptr_expr, symbol_expr = call.args - op = call.function.op + op = call.function.reduction_op assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) -- GitLab From 58cdb79206931dfe15a4fdb7f656d8d047a2a6b8 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 15:30:42 +0100 Subject: [PATCH 104/180] Fix bug with doubly inverted sign for subtraction reductions --- src/pystencils/backend/platforms/cuda.py | 5 +++-- src/pystencils/backend/platforms/generic_cpu.py | 6 +++++- src/pystencils/backend/platforms/x86.py | 5 ++++- tests/kernelcreation/test_reduction.py | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 90efebe61..9877cea44 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -168,10 +168,11 @@ class CudaPlatform(GenericGpu): match op: case ReductionOp.Sub: - # workaround for unsupported atomicSub: use atomic add and invert sign + # workaround for unsupported atomicSub: use atomic add + # similar to OpenMP reductions: local copies (negative sign) are added at the end call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) - call.args = (ptr_expr, -symbol_expr) + call.args = (ptr_expr, symbol_expr) case _: call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void")) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 7655572de..1e7468e33 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -8,6 +8,7 @@ from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsF PsReductionFunction from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr +from ...sympyextensions import ReductionOp from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform @@ -81,8 +82,11 @@ class GenericCpu(Platform): ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + # TODO: can this be avoided somehow? 
- potential_call = compound_op_to_expr(op, ptr_access, symbol_expr) + potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) if isinstance(potential_call, PsCall): potential_call.dtype = symbol_expr.dtype potential_call = self.select_function(potential_call) diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index ee14d1689..02b5ea6db 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -18,6 +18,7 @@ from ..ast.expressions import ( PsCall, ) from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant @@ -357,7 +358,9 @@ def _x86_op_intrin( suffix += "x" atype = vtype.scalar_type case PsVecHorizontal(): - opstr = f"horizontal_{op.reduction_op.name.lower()}" + # horizontal add instead of sub avoids double inversion of sign + actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op + opstr = f"horizontal_{actual_op.name.lower()}" rtype = vtype.scalar_type atypes = (vtype.scalar_type, vtype) case PsAdd(): diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 12dc4ba1c..537eb4b67 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -10,7 +10,7 @@ INIT_ARR = 2 SIZE = 15 SOLUTION = { "+": INIT_W + INIT_ARR * SIZE, - "-": INIT_W - INIT_ARR * -SIZE, + "-": INIT_W - INIT_ARR * SIZE, "*": INIT_W * INIT_ARR ** SIZE, "min": min(INIT_W, INIT_ARR), "max": max(INIT_W, INIT_ARR), -- GitLab From fe3cd6cd19f2c11166e2b8f670a0a8a337b898ef Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 16:02:05 +0100 Subject: [PATCH 105/180] Add generator for SIMD horizontal operations and the emitted code. 
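The emitted header src/pystencils/include/simd_horizontal_helpers.h is produced by the new script util/generate_simd_horizontal_op.py, which prints the generated code to stdout. Assuming the script is invoked from the repository root, the header can presumably be regenerated with, e.g.:

    python util/generate_simd_horizontal_op.py > src/pystencils/include/simd_horizontal_helpers.h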
--- .../include/simd_horizontal_helpers.h | 232 ++++++++++++- util/generate_simd_horizontal_op.py | 309 ++++++++++++++++++ 2 files changed, 535 insertions(+), 6 deletions(-) create mode 100644 util/generate_simd_horizontal_op.py diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h index 6a80f2107..cd4bd5730 100644 --- a/src/pystencils/include/simd_horizontal_helpers.h +++ b/src/pystencils/include/simd_horizontal_helpers.h @@ -1,11 +1,231 @@ #pragma once +#include <cmath> + +#if defined(__SSE3__) +#include <immintrin.h> + +inline double _mm_horizontal_add_pd(double dst, __m128d src) { + __m128d _v = src; + return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v)); +} + +inline float _mm_horizontal_add_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_hadd_ps(_v, _v); + return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h))); +} + +inline double _mm_horizontal_mul_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return dst * _r; +} + +inline float _mm_horizontal_mul_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm_horizontal_min_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmin(_r, dst); +} + +inline float _mm_horizontal_min_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm_horizontal_max_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmax(_r, dst); +} + +inline float _mm_horizontal_max_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX__) +#include <immintrin.h> + +inline double _mm256_horizontal_add_pd(double dst, __m256d src) { + __m256d _v = src; + __m256d _h = _mm256_hadd_pd(_v, _v); + return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} + +inline float _mm256_horizontal_add_ps(float dst, __m256 src) { + __m256 _v = src; + __m256 _h = _mm256_hadd_ps(_v, _v); + __m128 _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h)); + return dst + _mm_cvtss_f32(_mm_hadd_ps(_i,_i)); +} + +inline double _mm256_horizontal_mul_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1))); + return dst * _r; +} + +inline float _mm256_horizontal_mul_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm256_horizontal_min_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1))); + 
return fmin(_r, dst); +} + +inline float _mm256_horizontal_min_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm256_horizontal_max_pd(double dst, __m256d src) { + __m256d _v = src; + __m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v)); + double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1))); + return fmax(_r, dst); +} + +inline float _mm256_horizontal_max_ps(float dst, __m256 src) { + __m256 _v = src; + __m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v)); + __m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX512VL__) #include <immintrin.h> -#define QUALIFIERS inline +inline double _mm512_horizontal_add_pd(double dst, __m512d src) { + double _r = _mm512_reduce_add_pd(src); + return dst + _r; +} + +inline float _mm512_horizontal_add_ps(float dst, __m512 src) { + float _r = _mm512_reduce_add_ps(src); + return dst + _r; +} + +inline double _mm512_horizontal_mul_pd(double dst, __m512d src) { + double _r = _mm512_reduce_mul_pd(src); + return dst * _r; +} + +inline float _mm512_horizontal_mul_ps(float dst, __m512 src) { + float _r = _mm512_reduce_mul_ps(src); + return dst * _r; +} + +inline double _mm512_horizontal_min_pd(double dst, __m512d src) { + double _r = _mm512_reduce_min_pd(src); + return fmin(_r, dst); +} + +inline float _mm512_horizontal_min_ps(float dst, __m512 src) { + float _r = _mm512_reduce_min_ps(src); + return fmin(_r, dst); +} + +inline double _mm512_horizontal_max_pd(double dst, __m512d src) { + double _r = _mm512_reduce_max_pd(src); + return fmax(_r, dst); +} + +inline float _mm512_horizontal_max_ps(float dst, __m512 src) { + float _r = _mm512_reduce_max_ps(src); + return fmax(_r, dst); +} + +#endif + +#if defined(_M_ARM64) +#include <arm_neon.h> + +inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r += vgetq_lane_f64(_v,1); + return dst + _r; +} + +inline float vget_horizontal_add_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r += vget_lane_f32(_w,1); + return dst + _r; +} + +inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r *= vgetq_lane_f64(_v,1); + return dst * _r; +} + +inline float vget_horizontal_mul_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r *= vget_lane_f32(_w,1); + return dst * _r; +} + +inline double vgetq_horizontal_min_f64(double dst, float64x2_t src) { + float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmin(_r, vgetq_lane_f64(_v,1)); + return fmin(_r, dst); +} + +inline float vget_horizontal_min_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmin(_r, vget_lane_f32(_w,1)); + return fmin(_r, dst); +} + +inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) { + 
float64x2_t _v = src; + double _r = vgetq_lane_f64(_v,0); + _r = fmax(_r, vgetq_lane_f64(_v,1)); + return fmax(_r, dst); +} + +inline float vget_horizontal_max_f32(float dst, float32x4_t src) { + float32x4_t _v = src; + float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v)); + float _r = vgetq_lane_f32(_w,0); + _r = fmax(_r, vget_lane_f32(_w,1)); + return fmax(_r, dst); +} -QUALIFIERS double _mm256_horizontal_add_pd(double a, __m256d b) { - __m256d _v = b; - __m256d _h = _mm256_hadd_pd(_v,_v); - return a + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); -} \ No newline at end of file +#endif \ No newline at end of file diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py new file mode 100644 index 000000000..aebbf35bb --- /dev/null +++ b/util/generate_simd_horizontal_op.py @@ -0,0 +1,309 @@ +from enum import Enum + +FCT_QUALIFIERS = "inline" + + +class InstructionSets(Enum): + SSE3 = "SSE3" + AVX = "AVX" + AVX512 = "AVX512" + NEON = "NEON" + + def __str__(self): + return self.value + + +class ReductionOps(Enum): + Add = ("add", "+") + Mul = ("mul", "*") + Min = ("min", "min") + Max = ("max", "max") + + def __init__(self, op_name, op_str): + self.op_name = op_name + self.op_str = op_str + + +class ScalarTypes(Enum): + Double = "double" + Float = "float" + + def __str__(self): + return self.value + + +class VectorTypes(Enum): + SSE3_128d = "__m128d" + SSE3_128 = "__m128" + + AVX_256d = "__m256d" + AVX_256 = "__m256" + AVX_128 = "__m128" + + AVX_512d = "__m512d" + AVX_512 = "__m512" + + NEON_64x2 = "float64x2_t" + NEON_32x4 = "float32x4_t" + + def __str__(self): + return self.value + + +class Variable: + def __init__(self, name: str, dtype: ScalarTypes | VectorTypes): + self._name = name + self._dtype = dtype + + def __str__(self): + return f"{self._dtype} {self._name}" + + @property + def name(self) -> str: + return self._name + + @property + def dtype(self) -> ScalarTypes | VectorTypes: + return self._dtype + + +def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets: + match vtype: + case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d: + return InstructionSets.SSE3 + case VectorTypes.AVX_256 | VectorTypes.AVX_256d: + return InstructionSets.AVX + case VectorTypes.AVX_512 | VectorTypes.AVX_512d: + return InstructionSets.AVX512 + case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2: + return InstructionSets.NEON + + +def intrin_prefix(instruction_set: InstructionSets, double_prec: bool): + match instruction_set: + case InstructionSets.SSE3: + return "_mm" + case InstructionSets.AVX: + return "_mm256" + case InstructionSets.AVX512: + return "_mm512" + case InstructionSets.NEON: + return "vgetq" if double_prec else "vget" + case _: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def intrin_suffix(instruction_set: InstructionSets, double_prec: bool): + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + return "pd" if double_prec else "ps" + elif instruction_set in [InstructionSets.NEON]: + return "f64" if double_prec else "f32" + else: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str): + return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})" + + +def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset): + return 
f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})" + + +def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str): + return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})" + + +def generate_cvts_intrin(double_prec: bool, v: str): + convert_suffix = "f64" if double_prec else "f32" + intrin_suffix = "d" if double_prec else "s" + return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})" + + +def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps): + prefix = intrin_prefix(instruction_set, double_prec) + suffix = intrin_suffix(instruction_set, double_prec) + return f"{prefix}_horizontal_{op.op_name}_{suffix}" + + +def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable): + double_prec = svar.dtype is ScalarTypes.Double + return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n" + + +# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling +def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable): + reduction_op = ReductionOps.Add + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + sname = scalar_var.name + vtype = vector_var.dtype + vname = vector_var.name + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var) + cvts = lambda var: generate_cvts_intrin(double_prec, var) + + # function body + body = f"\t{vtype} _v = {vname};\n" + match instruction_set: + case InstructionSets.SSE3: + if double_prec: + body += f"\treturn {sname} + {cvts(hadd('_v'))};\n" + else: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_h', '_mm_movehdup_ps(_h)'))};\n" + + case InstructionSets.AVX: + if double_prec: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_mm256_extractf128_pd(_h,1)', '_mm256_castpd256_pd128(_h)'))};\n" + else: + add_i = "_mm_hadd_ps(_i,_i)" + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\t__m128 _i = {simd_op('_mm256_extractf128_ps(_h,1)', '_mm256_castps256_ps128(_h)')};\n" \ + f"\treturn {sname} + {cvts(add_i)};\n" + + case _: + raise ValueError(f"No specialized version of horizontal_add available for {instruction_set}") + + # function decl + decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var) + + return decl + body + "}\n" + + +def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable): + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + # generate specialized version for add operation + if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]: + return generate_simd_horizontal_add(scalar_var, vector_var) + + sname = scalar_var.name + stype = scalar_var.dtype + vtype = vector_var.dtype + vname = vector_var.name + + opname = reduction_op.op_name + opstr = reduction_op.op_str + + reduction_function = f"f{opname}" \ + if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + cvts = lambda var: generate_cvts_intrin(double_prec, 
var) + shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset) + + # function body + body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else "" + match instruction_set: + case InstructionSets.SSE3: + if double_prec: + body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n" + else: + body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \ + f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n" + + case InstructionSets.AVX: + if double_prec: + body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \ + f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))}; \n" + else: + body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \ + f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \ + f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n" + + case InstructionSets.AVX512: + suffix = intrin_suffix(instruction_set, double_prec) + body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n" + + case InstructionSets.NEON: + if double_prec: + body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n" + if reduction_function: + body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n" + else: + body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n" + else: + body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \ + f"\t{stype} _r = vgetq_lane_f32(_w,0);\n" + if reduction_function: + body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n" + else: + body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n" + + case _: + raise ValueError(f"Unsupported instruction set {instruction_set}") + + # finalize reduction + if reduction_function: + body += f"\treturn {reduction_function}(_r, {sname});\n" + else: + body += f"\treturn {sname} {opstr} _r;\n" + + # function decl + decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var) + + return decl + body + "}\n" + + +stypes = { + True: ScalarTypes.Double, + False: ScalarTypes.Float +} + +vtypes_for_instruction_set = { + InstructionSets.SSE3: { + True: VectorTypes.SSE3_128d, + False: VectorTypes.SSE3_128 + }, + InstructionSets.AVX: { + True: VectorTypes.AVX_256d, + False: VectorTypes.AVX_256 + }, + InstructionSets.AVX512: { + True: VectorTypes.AVX_512d, + False: VectorTypes.AVX_512 + }, + InstructionSets.NEON: { + True: VectorTypes.NEON_64x2, + False: VectorTypes.NEON_32x4 + }, +} + +guards_for_instruction_sets = { + InstructionSets.SSE3: "__SSE3__", + InstructionSets.AVX: "__AVX__", + InstructionSets.AVX512: '__AVX512VL__', + InstructionSets.NEON: '_M_ARM64', +} + +code = """#pragma once + +#include <cmath> + +""" + +for instruction_set in InstructionSets: + code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n" + + if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]: + code += "#include <immintrin.h>\n\n" + elif instruction_set == InstructionSets.NEON: + code += "#include <arm_neon.h>\n\n" + else: + ValueError(f"Missing header include for instruction set {instruction_set}") + + for reduction_op in ReductionOps: + for double_prec in [True, False]: + scalar_var = Variable("dst", stypes[double_prec]) + vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec]) + + code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n" + + code += "#endif\n\n" + +print(code) -- GitLab From 
71881893e551cfa3c5ce2217b8fad0ef751e3613 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 17:16:09 +0100 Subject: [PATCH 106/180] Split reduction test into separate CPU/GPU tests --- tests/kernelcreation/test_reduction.py | 52 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 537eb4b67..9fd385e02 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -17,38 +17,50 @@ SOLUTION = { } -@pytest.mark.parametrize('dtype', ["float64"]) -@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) -def test_reduction(target, dtype, op): - gpu_avail = target is ps.Target.GPU - +# get AST for kernel with reduction assignment +def get_reduction_assign_ast(dtype, op, config): x = ps.fields(f'x: {dtype}[1d]') w = ps.TypedSymbol("w", dtype) - # kernel with reduction assignment - red_assign = reduction_assignment_from_str(w, op, x.center()) - vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True} + return ps.create_kernel([red_assign], config, default_dtype=dtype) + - config = ps.CreateKernelConfig(target=target) if gpu_avail \ - else ps.CreateKernelConfig(target=target, cpu_openmp=True, cpu_vectorize_info=vectorize_info) +@pytest.mark.parametrize('instruction_set', ['sse', 'avx']) +@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction_cpu(instruction_set, dtype, op): + + vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True} - ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype) + config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() - # code_reduction = ps.get_code_str(ast_reduction) + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) + + +@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction_gpu(dtype, op): + config = ps.CreateKernelConfig(target=ps.Target.GPU) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) + ps.show_code(ast_reduction) kernel_reduction = ast_reduction.compile() array = np.full((SIZE,), INIT_ARR, dtype=dtype) reduction_array = np.full((1,), INIT_W, dtype=dtype) - if gpu_avail: - array_gpu = cp.asarray(array) - reduction_array_gpu = cp.asarray(reduction_array) + array_gpu = cp.asarray(array) + reduction_array_gpu = cp.asarray(reduction_array) - kernel_reduction(x=array_gpu, w=reduction_array_gpu) - assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) - else: - kernel_reduction(x=array, w=reduction_array) - assert np.allclose(reduction_array, SOLUTION[op]) + kernel_reduction(x=array_gpu, w=reduction_array_gpu) + assert np.allclose(reduction_array_gpu.get(), SOLUTION[op]) -- GitLab From 13569a616ef3eb6a28a698e8163f0748d4a4c0c0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 17:31:23 +0100 Subject: [PATCH 107/180] Fix lint --- src/pystencils/backend/ast/vector.py | 6 ++---- src/pystencils/backend/platforms/cuda.py | 6 +++--- 
.../backend/transformations/loop_vectorizer.py | 18 +++++++++--------- src/pystencils/compound_op_mapping.py | 2 -- src/pystencils/jit/gpu_cupy.py | 3 +-- src/pystencils/sympyextensions/reduction.py | 10 +++++++--- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 14249e1e8..5121987a8 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -92,11 +92,9 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecHorizontal): return False - return ( - super().structurally_equal(other) + return (super().structurally_equal(other) and self._lanes == other._lanes - and self._reduction_op == other._reduction_op - ) + and self._reduction_op == other._reduction_op) class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 9877cea44..e8c8f6a3a 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -89,7 +89,7 @@ class CudaPlatform(GenericGpu): if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): assert isinstance(dtype, PsIeeeFloatType) - defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" } + defines = {NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY"} return PsLiteralExpr(PsLiteral(defines[func], dtype)) @@ -170,8 +170,8 @@ class CudaPlatform(GenericGpu): case ReductionOp.Sub: # workaround for unsupported atomicSub: use atomic add # similar to OpenMP reductions: local copies (negative sign) are added at the end - call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], - PsCustomType("void")) + call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], + PsCustomType("void")) call.args = (ptr_expr, symbol_expr) case _: call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index ab28507c2..b78114553 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -240,14 +240,14 @@ class LoopVectorizer: ) return PsBlock( - simd_init_local_reduction_vars + - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop - ] + - simd_writeback_local_reduction_vars + - [ + ] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -258,13 +258,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( - simd_init_local_reduction_vars + - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop, - ] + - simd_writeback_local_reduction_vars + ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py index 2dd88fc94..f256369f9 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -1,5 +1,3 @@ -from operator import truediv, mul, sub, add - from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv from .backend.exceptions import FreezeError from .backend.functions import PsMathFunction, 
MathFunctions diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 331b58ce5..0792b6c01 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,6 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType, PsPointerType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -19,7 +18,7 @@ from ..codegen import ( Parameter, ) from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr -from ..types import PsStructType, PsPointerType +from ..types import PsType, PsStructType, PsPointerType from ..include import get_pystencils_include_path diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 25ae5c0ac..cebfcb2f7 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -19,14 +19,18 @@ class ReductionAssignment(AssignmentBase): Attributes: =========== - binop : CompoundOp + reduction_op : ReductionOp Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. """ - reduction_op = None # type: ReductionOp + _reduction_op = None # type: ReductionOp @property def reduction_op(self): - return self.reduction_op + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op): + self._reduction_op = op class AddReductionAssignment(ReductionAssignment): -- GitLab From d14898373c0503bd9bbde0d3c0ee35888519f11f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 17:54:40 +0100 Subject: [PATCH 108/180] Fix typecheck --- src/pystencils/backend/ast/vector.py | 2 +- src/pystencils/backend/platforms/cuda.py | 13 ++++++++++--- src/pystencils/jit/cpu_extension_module.py | 6 +++++- src/pystencils/jit/gpu_cupy.py | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 5121987a8..4f5224133 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -46,7 +46,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): class PsVecHorizontal(PsBinOp, PsVectorOp): """Extracts scalar value from N vector lanes.""" - __match_args__ = ("lanes", "scalar_operand", "vector_operand", "operation") + __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op") def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression, reduction_op: ReductionOp): diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index e8c8f6a3a..12a18b41b 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -87,11 +87,18 @@ class CudaPlatform(GenericGpu): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): + if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions: assert isinstance(dtype, PsIeeeFloatType) - defines = {NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY"} - return PsLiteralExpr(PsLiteral(defines[func], dtype)) + match func: + case NumericLimitsFunctions.Min: + define = "NEG_INFINITY" + case NumericLimitsFunctions.Max: + define = "POS_INFINITY" + case _: + raise MaterializationError(f"Cannot materialize call to function {func}") + + return 
PsLiteralExpr(PsLiteral(define, dtype)) if isinstance(dtype, PsIeeeFloatType): match func: diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index bdf99b7ad..03260f649 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -286,7 +286,11 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ def extract_ptr(self, param: Parameter) -> str: if param not in self._pointer_extractions: ptr = param.symbol - self._buffer_types[ptr] = ptr.dtype.base_type + ptr_dtype = ptr.dtype + + assert isinstance(ptr_dtype, PsPointerType) + + self._buffer_types[ptr] = ptr_dtype.base_type self.extract_buffer(ptr, param.name) buffer = self.get_buffer(param.name) code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 0792b6c01..6b0ccf02f 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -197,7 +197,7 @@ class CupyKernelWrapper(KernelWrapper): args.append(val) else: # scalar parameter - val: Any = kwargs[kparam.name] + val = kwargs[kparam.name] add_arg(kparam.name, val, kparam.dtype) # Determine launch grid -- GitLab From ef185b4e48f0c0ca40aaba84c928289995537f45 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 18:18:42 +0100 Subject: [PATCH 109/180] Fix ImportError for cupy --- tests/kernelcreation/test_reduction.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 9fd385e02..1824ea095 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -1,6 +1,10 @@ import pytest import numpy as np -import cupy as cp + +try: + import cupy as cp +except ImportError: + pass import pystencils as ps from pystencils.sympyextensions import reduction_assignment_from_str -- GitLab From 0c40ed634587f5854d11f7851c7ba9eb5911f4ff Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 17 Feb 2025 18:31:41 +0100 Subject: [PATCH 110/180] Use import or skip mechanism for cupy --- tests/kernelcreation/test_reduction.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 1824ea095..ec23297b0 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -1,11 +1,6 @@ import pytest import numpy as np -try: - import cupy as cp -except ImportError: - pass - import pystencils as ps from pystencils.sympyextensions import reduction_assignment_from_str @@ -35,7 +30,6 @@ def get_reduction_assign_ast(dtype, op, config): @pytest.mark.parametrize('dtype', ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_cpu(instruction_set, dtype, op): - vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True} config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info) @@ -54,6 +48,9 @@ def test_reduction_cpu(instruction_set, dtype, op): @pytest.mark.parametrize('dtype', ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_gpu(dtype, op): + pytest.importorskip('cupy') + import cupy as cp + config = ps.CreateKernelConfig(target=ps.Target.GPU) ast_reduction = get_reduction_assign_ast(dtype, op, config) -- 
GitLab From 77a2226818946a666f98b23e95a76c16b77c1a82 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Feb 2025 12:57:49 +0100 Subject: [PATCH 111/180] Avoid duplicate definition of atomicMin/Max for HIP --- src/pystencils/include/gpu_defines.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h index 5525bbc69..34cff79de 100644 --- a/src/pystencils/include/gpu_defines.h +++ b/src/pystencils/include/gpu_defines.h @@ -13,10 +13,11 @@ typedef __hip_uint16_t uint16_t; typedef __hip_int16_t int16_t; #endif -#ifdef __CUDA_ARCH__ -// No direct implementation of atomic multiplication, minimum and maximum available +// No direct implementation for all atomic operations available // -> add support by custom implementations using a CAS mechanism +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + // - atomicMul (double/float) // see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division __device__ double atomicMul(double* address, double val) { @@ -43,6 +44,10 @@ __device__ float atomicMul(float* address, float val) { return __int_as_float(old); } +#endif + +#ifdef __CUDA_ARCH__ + // - atomicMin (double/float) // see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda __device__ __forceinline__ double atomicMin(double *address, double val) { @@ -94,4 +99,5 @@ __device__ __forceinline__ float atomicMax(float *address, float val) } return __int_as_float(ret); } + #endif -- GitLab From d10c65d43919f5ee611fc4a0ef7c79f3c3e65822 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Tue, 18 Feb 2025 17:29:22 +0100 Subject: [PATCH 112/180] Catch CUDARuntimeError for missing CUDA capable device in reduction GPU test --- tests/kernelcreation/test_reduction.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index ec23297b0..992c328d7 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -2,6 +2,7 @@ import pytest import numpy as np import pystencils as ps +from cupy_backends.cuda.api.runtime import CUDARuntimeError from pystencils.sympyextensions import reduction_assignment_from_str INIT_W = 5 @@ -48,8 +49,15 @@ def test_reduction_gpu(dtype, op): - pytest.importorskip('cupy') - import cupy as cp + try: + import cupy as cp + + device_count = range(cp.cuda.runtime.getDeviceCount()) + print(f"Found {device_count} GPUs") + except ImportError: + pytest.skip(reason="CuPy is not available", allow_module_level=True) + except CUDARuntimeError: + pytest.skip(reason="No CUDA capable device is detected", allow_module_level=True) config = ps.CreateKernelConfig(target=ps.Target.GPU) -- GitLab From ce816539159408b26b09b4bf6e1df0cbff437829 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 19 Feb 2025 16:33:25 +0100 Subject: [PATCH 113/180] Encapsulate fetching of kernel conditions for iteration spaces in separate function --- src/pystencils/backend/platforms/cuda.py | 55 ++++++++++++++++++------ 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py index
fb613347a..eff88df7e 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -315,13 +315,47 @@ class CudaPlatform(GenericGpu): # Internals + # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU? + + def _get_condition_for_translation( + self, ispace: IterationSpace): + + if not self._omit_range_check: + return None + + match ispace: + case FullIterationSpace(): + + dimensions = ispace.dimensions_in_loop_order() + + conds = [] + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + if conds: + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + return condition + else: + return None + + case SparseIterationSpace(): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + case _: + assert False, "Unknown iteration space" + def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace ) -> PsBlock: ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] - conds = [] + cond = self._get_condition_for_translation(ispace) dimensions = ispace.dimensions_in_loop_order() @@ -335,14 +369,9 @@ class CudaPlatform(GenericGpu): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - if not self._omit_range_check: - conds.append(PsLt(ctr_expr, dim.stop)) - - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) + + if cond: + ast = PsBlock(indexing_decls + [PsConditional(cond, body)]) else: body.statements = indexing_decls + body.statements ast = body @@ -355,6 +384,8 @@ class CudaPlatform(GenericGpu): factory = AstFactory(self._ctx) ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) + cond = self._get_condition_for_translation(ispace) + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) ctr_mapping = self._thread_mapping(ispace) @@ -377,10 +408,8 @@ class CudaPlatform(GenericGpu): ] body.statements = mappings + body.statements - if not self._omit_range_check: - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) + if cond: + ast = PsBlock([sparse_idx_decl, PsConditional(cond, body)]) else: body.statements = [sparse_idx_decl] + body.statements ast = body -- GitLab From ad292c2b42e01c38843eb1a5556ea2ced762f845 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 19 Feb 2025 16:34:14 +0100 Subject: [PATCH 114/180] Add initial version of warp-level reduction for CUDA --- src/pystencils/backend/platforms/cuda.py | 73 ++++++++++++++++++------ 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index eff88df7e..6f32102de 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,9 +1,16 @@ from __future__ import annotations + +import math +import operator from abc import ABC, abstractmethod +from functools import reduce from ..ast import PsAstNode +from ..constants import PsConstant +from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions.reduction import ReductionOp from ...types import constify, 
deconstify, PsPointerType, PsScalarType, PsCustomType +from ...types.quick import UInt, SInt from ..exceptions import MaterializationError from .generic_gpu import GenericGpu @@ -17,14 +24,14 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement +from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment from ..ast.expressions import ( PsExpression, PsLiteralExpr, PsCast, PsCall, PsLookup, - PsBufferAcc, PsSymbolExpr + PsBufferAcc, PsSymbolExpr, PsConstantExpr, PsAdd, PsRem, PsEq ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType @@ -292,26 +299,58 @@ class CudaPlatform(GenericGpu): case ReductionFunctions.WriteBackToPtr: ptr_expr, symbol_expr = call.args op = call.function.reduction_op + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + warp_size = 32 # TODO: get from platform/user config + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + NotImplementedError("atomic operations are only available for float32/64 datatypes") + def gen_shuffle_instr(offset: int): + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))), + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32)))]) + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end match op: case ReductionOp.Sub: - # workaround for unsupported atomicSub: use atomic add - # similar to OpenMP reductions: local copies (negative sign) are added at the end - call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype], - PsCustomType("void")) - call.args = (ptr_expr, symbol_expr) + actual_op = ReductionOp.Add case _: - call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], - PsCustomType("void")) - call.args = (ptr_expr, symbol_expr) - - if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64): - NotImplementedError("atomicMul is only available for float32/64 datatypes") - - return PsStatement(call) + actual_op = op + + # perform local warp reductions + num_shuffles = math.frexp(warp_size)[1] - 1 + shuffles = [PsAssignment(symbol_expr, compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(i))) + for i in reversed(range(1, num_shuffles))] + + # find first thread in warp + ispace = self._ctx.get_iteration_space() # TODO: receive as argument in unfold_function? 
+ is_valid_thread = self._get_condition_for_translation(ispace) + thread_indices_per_dim = [ + idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) + for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + ] + tid: PsExpression = thread_indices_per_dim[0] + for t in thread_indices_per_dim[1:]: + tid = PsAdd(tid, t) + first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32)))) + cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + + # use atomic operation on first thread of warp to sync + call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return PsBlock( + shuffles + + [PsConditional(cond, PsBlock([PsStatement(call)]))]) # Internals -- GitLab From c7b564bed65615c77f1065dd4821b5a79d2244b3 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 19 Feb 2025 19:04:57 +0100 Subject: [PATCH 115/180] Fix CUDARuntimeError import --- tests/kernelcreation/test_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 992c328d7..c3775964b 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -2,7 +2,6 @@ import pytest import numpy as np import pystencils as ps -from cupy_backends.cuda.api.runtime import CUDARuntimeError from pystencils.sympyextensions import reduction_assignment_from_str INIT_W = 5 @@ -51,6 +50,7 @@ def test_reduction_cpu(instruction_set, dtype, op): def test_reduction_gpu(dtype, op): try: import cupy as cp + from cupy_backends.cuda.api.runtime import CUDARuntimeError device_count = range(cp.cuda.runtime.getDeviceCount()) print(f"Found {device_count} GPUs") -- GitLab From 2b6589b8fd5eea2e416432c9261884e9f50e4e1c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 19 Feb 2025 19:06:30 +0100 Subject: [PATCH 116/180] Introduce masks for warp reductions and fix errors when shuffling warp results --- src/pystencils/backend/platforms/cuda.py | 34 ++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 6f32102de..873961cc7 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -5,6 +5,8 @@ import operator from abc import ABC, abstractmethod from functools import reduce +from pystencils.types import PsBoolType + from ..ast import PsAstNode from ..constants import PsConstant from ...compound_op_mapping import compound_op_to_expr @@ -310,11 +312,9 @@ class CudaPlatform(GenericGpu): if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): NotImplementedError("atomic operations are only available for float32/64 datatypes") - def gen_shuffle_instr(offset: int): - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))), - symbol_expr, - PsConstantExpr(PsConstant(offset, SInt(32)))]) + # set up mask symbol for active threads in warp + mask = PsSymbol("__shfl_mask", UInt(32)) + self._ctx.add_symbol(mask) # workaround for subtractions -> use additions for reducing intermediate results # similar to OpenMP reductions: local copies (negative sign) are added at the end @@ -325,8 +325,13 @@ class 
CudaPlatform(GenericGpu): actual_op = op # perform local warp reductions - num_shuffles = math.frexp(warp_size)[1] - 1 - shuffles = [PsAssignment(symbol_expr, compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(i))) + def gen_shuffle_instr(offset: int): + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [PsSymbolExpr(mask), symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + + num_shuffles = math.frexp(warp_size)[1] + shuffles = [PsAssignment(symbol_expr, + compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) for i in reversed(range(1, num_shuffles))] # find first thread in warp @@ -343,14 +348,21 @@ class CudaPlatform(GenericGpu): PsConstantExpr(PsConstant(0, SInt(32)))) cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)), + [full_mask, is_valid_thread]) + decl_mask = PsDeclaration(PsSymbolExpr(mask), ballot_instr if is_valid_thread else full_mask) + # use atomic operation on first thread of warp to sync call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) call.args = (ptr_expr, symbol_expr) # assemble warp reduction - return PsBlock( - shuffles - + [PsConditional(cond, PsBlock([PsStatement(call)]))]) + return PsConditional(is_valid_thread if is_valid_thread else PsConstantExpr(PsLiteral("true", PsBoolType)), + PsBlock( + [decl_mask] + + shuffles + + [PsConditional(cond, PsBlock([PsStatement(call)]))])) # Internals @@ -359,7 +371,7 @@ class CudaPlatform(GenericGpu): def _get_condition_for_translation( self, ispace: IterationSpace): - if not self._omit_range_check: + if self._omit_range_check: return None match ispace: -- GitLab From bcd83842f3d331fa039a96e3eab68c34b3beb6f3 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 14:56:46 +0100 Subject: [PATCH 117/180] Use full mask for CUDA reductions --- src/pystencils/backend/platforms/cuda.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 873961cc7..8936bf73f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -313,8 +313,9 @@ class CudaPlatform(GenericGpu): NotImplementedError("atomic operations are only available for float32/64 datatypes") # set up mask symbol for active threads in warp - mask = PsSymbol("__shfl_mask", UInt(32)) - self._ctx.add_symbol(mask) + #mask = PsSymbol("__shfl_mask", UInt(32)) + #self._ctx.add_symbol(mask) + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) # workaround for subtractions -> use additions for reducing intermediate results # similar to OpenMP reductions: local copies (negative sign) are added at the end @@ -327,7 +328,7 @@ class CudaPlatform(GenericGpu): # perform local warp reductions def gen_shuffle_instr(offset: int): return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [PsSymbolExpr(mask), symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) num_shuffles = math.frexp(warp_size)[1] shuffles = [PsAssignment(symbol_expr, @@ -348,21 +349,19 @@ class CudaPlatform(GenericGpu): PsConstantExpr(PsConstant(0, SInt(32)))) cond = PsAnd(is_valid_thread, 
first_thread_in_warp) if is_valid_thread else first_thread_in_warp - full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)), - [full_mask, is_valid_thread]) - decl_mask = PsDeclaration(PsSymbolExpr(mask), ballot_instr if is_valid_thread else full_mask) + #ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)), + # [full_mask, is_valid_thread]) + #decl_mask = PsDeclaration(full_mask) # use atomic operation on first thread of warp to sync call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) call.args = (ptr_expr, symbol_expr) # assemble warp reduction - return PsConditional(is_valid_thread if is_valid_thread else PsConstantExpr(PsLiteral("true", PsBoolType)), - PsBlock( - [decl_mask] - + shuffles - + [PsConditional(cond, PsBlock([PsStatement(call)]))])) + return PsBlock( + #[decl_mask] + shuffles + + [PsConditional(cond, PsBlock([PsStatement(call)]))]) # Internals -- GitLab From a8479afadc2a95e2ecda861ce7fd186375477347 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 18:11:51 +0100 Subject: [PATCH 118/180] Refactor reductionfunction mechanism --- src/pystencils/backend/functions.py | 1 - .../backend/kernelcreation/typification.py | 2 +- src/pystencils/backend/platforms/cuda.py | 142 ++++++++---------- .../backend/platforms/generic_cpu.py | 49 +++--- src/pystencils/backend/platforms/platform.py | 12 +- src/pystencils/backend/platforms/sycl.py | 9 +- .../transformations/select_functions.py | 36 ++++- src/pystencils/codegen/driver.py | 16 +- 8 files changed, 119 insertions(+), 148 deletions(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index d28ef5f44..4e38de5e9 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -142,7 +142,6 @@ class ReductionFunctions(Enum): Each platform has to materialize these functions to a concrete implementation. 
""" - InitLocalCopy = ("InitLocalCopy", 2) WriteBackToPtr = ("WriteBackToPtr", 2) def __init__(self, func_name, num_args): diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 544746ef6..284e80b9d 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -617,7 +617,7 @@ class Typifier: case PsCall(function, args): match function: - case PsMathFunction() | PsReductionFunction(): + case PsMathFunction(): for arg in args: self.visit_expr(arg, tc) tc.infer_dtype(expr) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 8936bf73f..1f6506c8f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -212,10 +212,66 @@ class CudaPlatform(GenericGpu): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) - + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: func = call.function.func + + if func in ReductionFunctions: + match func: + case ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call.function.reduction_op + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + warp_size = 32 # TODO: get from platform/user config + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + NotImplementedError("atomic operations are only available for float32/64 datatypes") + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match op: + case ReductionOp.Sub: + actual_op = ReductionOp.Add + case _: + actual_op = op + + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + + num_shuffles = math.frexp(warp_size)[1] + shuffles = [PsAssignment(symbol_expr, + compound_op_to_expr(actual_op, + symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) + for i in reversed(range(1, num_shuffles))] + + # find first thread in warp + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + thread_indices_per_dim = [ + idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) + for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + ] + tid: PsExpression = thread_indices_per_dim[0] + for t in thread_indices_per_dim[1:]: + tid = PsAdd(tid, t) + first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32)))) + cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + + # use atomic operation on first thread of warp to sync + call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return (shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))) + dtype = 
call.get_dtype() arg_types = (dtype,) * func.num_args @@ -232,7 +288,7 @@ class CudaPlatform(GenericGpu): return PsLiteralExpr(PsLiteral(define, dtype)) - if isinstance(dtype, PsIeeeFloatType): + if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions: match func: case ( MathFunctions.Exp @@ -285,84 +341,6 @@ class CudaPlatform(GenericGpu): f"No implementation available for function {func} on data type {dtype}" ) - def unfold_function( - self, call: PsCall - ) -> PsAstNode: - assert isinstance(call.function, PsReductionFunction) - - func = call.function.func - - match func: - case ReductionFunctions.InitLocalCopy: - symbol_expr, init_val = call.args - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression) - - return PsDeclaration(symbol_expr, init_val) - case ReductionFunctions.WriteBackToPtr: - ptr_expr, symbol_expr = call.args - op = call.function.reduction_op - stype = symbol_expr.dtype - ptrtype = ptr_expr.dtype - - warp_size = 32 # TODO: get from platform/user config - - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) - - if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): - NotImplementedError("atomic operations are only available for float32/64 datatypes") - - # set up mask symbol for active threads in warp - #mask = PsSymbol("__shfl_mask", UInt(32)) - #self._ctx.add_symbol(mask) - full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - - # workaround for subtractions -> use additions for reducing intermediate results - # similar to OpenMP reductions: local copies (negative sign) are added at the end - match op: - case ReductionOp.Sub: - actual_op = ReductionOp.Add - case _: - actual_op = op - - # perform local warp reductions - def gen_shuffle_instr(offset: int): - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) - - num_shuffles = math.frexp(warp_size)[1] - shuffles = [PsAssignment(symbol_expr, - compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) - for i in reversed(range(1, num_shuffles))] - - # find first thread in warp - ispace = self._ctx.get_iteration_space() # TODO: receive as argument in unfold_function? - is_valid_thread = self._get_condition_for_translation(ispace) - thread_indices_per_dim = [ - idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) - for i, idx in enumerate(THREAD_IDX[:ispace.rank]) - ] - tid: PsExpression = thread_indices_per_dim[0] - for t in thread_indices_per_dim[1:]: - tid = PsAdd(tid, t) - first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32)))) - cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp - - #ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)), - # [full_mask, is_valid_thread]) - #decl_mask = PsDeclaration(full_mask) - - # use atomic operation on first thread of warp to sync - call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) - call.args = (ptr_expr, symbol_expr) - - # assemble warp reduction - return PsBlock( - #[decl_mask] - shuffles - + [PsConditional(cond, PsBlock([PsStatement(call)]))]) - # Internals # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU? 
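
For orientation, the WriteBackToPtr lowering above assembles a warp-level tree reduction (the shuffles tuple) followed by a single atomic update per warp (the conditional wrapped around the atomic call). The following is a rough, hand-written CUDA sketch of the device-side pattern this is intended to correspond to; it is illustrative only, not code generated by or contained in this repository, and the simplifying assumptions are spelled out in the comments.

// Illustrative sketch only: the device-side pattern the WriteBackToPtr lowering
// above is meant to emit for a float64 sum reduction. Assumes warp_size = 32,
// a purely 1D thread index and an sm_60+ target (double-precision atomicAdd).
// Identifiers such as result, partial and is_valid_thread are placeholders,
// not names produced by the code generator.
__device__ void warp_reduce_write_back(double* result, double partial, bool is_valid_thread)
{
    const unsigned int full_mask = 0xffffffffu;

    // math.frexp(32)[1] == 6, so the emitted shuffle sequence uses the offsets
    // 16, 8, 4, 2, 1: each step folds half of the remaining lanes together.
    for (int offset = 16; offset > 0; offset /= 2) {
        partial += __shfl_xor_sync(full_mask, partial, offset);
    }

    // Only the first valid lane of each warp commits the warp's partial result,
    // mirroring PsConditional(cond, PsBlock([PsStatement(call)])) above.
    if (is_valid_thread && threadIdx.x % 32 == 0) {
        atomicAdd(result, partial);  // f"atomic{actual_op.name}" resolves to atomicAdd here
    }
}

Note that the full 0xffffffff mask is only safe if every lane of the warp participates; a later patch in this series therefore gates the shuffle sequence behind the assume_warp_aligned_block_size option and otherwise falls back to one atomic operation per valid thread. Likewise, reduction operations without a native atomic (multiplication, and floating-point min/max) rely on the CAS-based helpers that a later patch adds in gpu_atomics.h.
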
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 1e7468e33..24692b25c 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,8 +4,7 @@ from typing import Sequence from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr from ..ast import PsAstNode -from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, \ - PsReductionFunction +from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions import ReductionOp @@ -60,43 +59,31 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def unfold_function( - self, call: PsCall - ) -> PsAstNode: - assert isinstance(call.function, PsReductionFunction) - + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: func = call.function.func - match func: - case ReductionFunctions.InitLocalCopy: - symbol_expr, init_val = call.args - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression) - - return PsDeclaration(symbol_expr, init_val) - case ReductionFunctions.WriteBackToPtr: - ptr_expr, symbol_expr = call.args - op = call.function.reduction_op - - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + if func in ReductionFunctions: + match func: + case ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call.function.reduction_op - ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) - # inspired by OpenMP: local reduction variable (negative sign) is added at the end - actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - # TODO: can this be avoided somehow? - potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) - if isinstance(potential_call, PsCall): - potential_call.dtype = symbol_expr.dtype - potential_call = self.select_function(potential_call) + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op - return PsAssignment(ptr_access, potential_call) + # TODO: can this be avoided somehow? 
+ potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + potential_call = self.select_function(potential_call) - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + return potential_call - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index e195d59bc..90fd69084 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -38,19 +38,9 @@ class Platform(ABC): @abstractmethod def select_function( self, call: PsCall - ) -> PsExpression: + ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsExpression]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. """ pass - - @abstractmethod - def unfold_function( - self, call: PsCall - ) -> PsAstNode: - """Unfolds an implementation for the given function on the given data type. - - If no viable implementation exists, raise a `MaterializationError`. - """ - pass diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 9b077fd2b..eae2b7598 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -57,7 +57,7 @@ class SyclPlatform(GenericGpu): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: assert isinstance(call.function, PsMathFunction) func = call.function.func @@ -108,13 +108,6 @@ class SyclPlatform(GenericGpu): f"No implementation available for function {func} on data type {dtype}" ) - def unfold_function( - self, call: PsCall - ) -> PsAstNode: - raise MaterializationError( - f"No implementation available for function {call.function.name}" - ) - def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace ) -> PsBlock: diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 0045de87b..288650698 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,6 +1,8 @@ -from ..platforms import Platform +from ..ast.structural import PsStatement, PsAssignment, PsBlock +from ..exceptions import MaterializationError +from ..platforms import Platform, CudaPlatform from ..ast import PsAstNode -from ..ast.expressions import PsCall +from ..ast.expressions import PsCall, PsExpression from ..functions import PsMathFunction, PsReductionFunction @@ -17,9 +19,31 @@ class SelectFunctions: def visit(self, node: PsAstNode) -> PsAstNode: node.children = [self.visit(c) for c in node.children] - if isinstance(node, PsCall) and isinstance(node.function, PsMathFunction): - return self._platform.select_function(node) - elif isinstance(node, PsCall) and isinstance(node.function, PsReductionFunction): - return self._platform.unfold_function(node) + if isinstance(node, PsAssignment): + rhs = node.rhs + if isinstance(rhs, PsCall) and isinstance(rhs.function, PsReductionFunction): + resolved_func = self._platform.select_function(rhs) + + match resolved_func: + case ((prepend), expr): 
+ match self._platform: + case CudaPlatform(): + # special case: produces statement with atomic operation writing value back to ptr + return PsBlock(prepend + [PsStatement(expr)]) + case _: + return PsBlock(prepend + [PsAssignment(node.lhs, expr)]) + case PsExpression(): + return PsAssignment(node.lhs, resolved_func) + case _: + raise MaterializationError( + f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions." + ) + else: + return node + elif isinstance(node, PsCall) and isinstance(node.function, PsMathFunction): + resolved_func = self._platform.select_function(node) + assert isinstance(resolved_func, PsExpression) + + return resolved_func else: return node diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 96e9b94ed..9f04d074a 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -16,7 +16,7 @@ from .kernel import Kernel, GpuKernel from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter from ..backend.functions import PsReductionFunction, ReductionFunctions -from ..backend.ast.expressions import PsSymbolExpr, PsCall +from ..backend.ast.expressions import PsSymbolExpr, PsCall, PsMemAcc, PsConstantExpr from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration from ..field import Field @@ -24,7 +24,7 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop +from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -187,16 +187,16 @@ class DefaultKernelCreationDriver: symbol_expr = typify(PsSymbolExpr(symbol)) ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) init_val = typify(reduction_info.init_val) + ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - init_local_copy = PsCall(PsReductionFunction(ReductionFunctions.InitLocalCopy, reduction_info.op), - [symbol_expr, init_val]) + decl_local_copy = PsDeclaration(symbol_expr, init_val) write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op), [ptr_symbol_expr, symbol_expr]) - # Init local reduction variable copy - kernel_ast.statements = [init_local_copy] + kernel_ast.statements - # Write back result to reduction target variable - kernel_ast.statements += [write_back_ptr] + prepend_ast = [decl_local_copy] # declare and init local copy with neutral element + append_ast = [PsAssignment(ptr_access, write_back_ptr)] # write back result to reduction target variable + + kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast # Target-Specific optimizations if self._target.is_cpu(): -- GitLab From 89a6f36ad50849267b3a95ef5a035d22566c5748 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 18:17:44 +0100 Subject: [PATCH 119/180] Fix lint --- src/pystencils/backend/kernelcreation/typification.py | 2 +- src/pystencils/backend/platforms/cuda.py | 5 +---- src/pystencils/backend/platforms/generic_cpu.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 284e80b9d..3ca0a16e2 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ 
b/src/pystencils/backend/kernelcreation/typification.py @@ -50,7 +50,7 @@ from ..ast.expressions import ( PsNot, ) from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal -from ..functions import PsMathFunction, CFunction, PsReductionFunction +from ..functions import PsMathFunction, CFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 1f6506c8f..6df502c1f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -5,8 +5,6 @@ import operator from abc import ABC, abstractmethod from functools import reduce -from pystencils.types import PsBoolType - from ..ast import PsAstNode from ..constants import PsConstant from ...compound_op_mapping import compound_op_to_expr @@ -38,8 +36,7 @@ from ..ast.expressions import ( from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions, \ - NumericLimitsFunctions +from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions int32 = PsSignedIntegerType(width=32, const=False) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 24692b25c..3ffdfa22f 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -21,7 +21,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsAssignment +from ..ast.structural import PsDeclaration, PsLoop, PsBlock from ..ast.expressions import ( PsSymbolExpr, PsExpression, -- GitLab From 02be4d5e994e458e0e42d72045ec53355e7820d1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 21 Feb 2025 19:35:48 +0100 Subject: [PATCH 120/180] Fix typecheck --- src/pystencils/backend/platforms/cuda.py | 121 +++++++++--------- .../backend/platforms/generic_cpu.py | 40 +++--- src/pystencils/backend/platforms/platform.py | 2 +- .../transformations/select_functions.py | 20 +-- src/pystencils/codegen/driver.py | 8 +- 5 files changed, 99 insertions(+), 92 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 6df502c1f..291858810 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -36,8 +36,8 @@ from ..ast.expressions import ( from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions - +from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \ + PsMathFunction int32 = PsSignedIntegerType(width=32, const=False) @@ -209,65 +209,66 @@ class CudaPlatform(GenericGpu): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: - func = call.function.func + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) 
- if func in ReductionFunctions: - match func: - case ReductionFunctions.WriteBackToPtr: - ptr_expr, symbol_expr = call.args - op = call.function.reduction_op - stype = symbol_expr.dtype - ptrtype = ptr_expr.dtype - - warp_size = 32 # TODO: get from platform/user config - - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) - - if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): - NotImplementedError("atomic operations are only available for float32/64 datatypes") - - # workaround for subtractions -> use additions for reducing intermediate results - # similar to OpenMP reductions: local copies (negative sign) are added at the end - match op: - case ReductionOp.Sub: - actual_op = ReductionOp.Add - case _: - actual_op = op - - # perform local warp reductions - def gen_shuffle_instr(offset: int): - full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) - - num_shuffles = math.frexp(warp_size)[1] - shuffles = [PsAssignment(symbol_expr, - compound_op_to_expr(actual_op, - symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) - for i in reversed(range(1, num_shuffles))] - - # find first thread in warp - ispace = self._ctx.get_iteration_space() - is_valid_thread = self._get_condition_for_translation(ispace) - thread_indices_per_dim = [ - idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) - for i, idx in enumerate(THREAD_IDX[:ispace.rank]) - ] - tid: PsExpression = thread_indices_per_dim[0] - for t in thread_indices_per_dim[1:]: - tid = PsAdd(tid, t) - first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32)))) - cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp - - # use atomic operation on first thread of warp to sync - call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) - call.args = (ptr_expr, symbol_expr) - - # assemble warp reduction - return (shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))) + func = call_func.func + + if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + warp_size = 32 # TODO: get from platform/user config + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + NotImplementedError("atomic operations are only available for float32/64 datatypes") + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match op: + case ReductionOp.Sub: + actual_op = ReductionOp.Add + case _: + actual_op = op + + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + + num_shuffles = math.frexp(warp_size)[1] + 
shuffles = tuple(PsAssignment(symbol_expr, + compound_op_to_expr(actual_op, + symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) + for i in reversed(range(1, num_shuffles))) + + # find first thread in warp + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + thread_indices_per_dim = [ + idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) + for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + ] + tid: PsExpression = thread_indices_per_dim[0] + for t in thread_indices_per_dim[1:]: + tid = PsAdd(tid, t) + first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32)))) + cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + + # use atomic operation on first thread of warp to sync + call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return shuffles, PsConditional(cond, PsBlock([PsStatement(call)])) dtype = call.get_dtype() arg_types = (dtype,) * func.num_args diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 3ffdfa22f..2f873ff29 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,7 +4,8 @@ from typing import Sequence from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr from ..ast import PsAstNode -from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions +from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, PsMathFunction, \ + PsReductionFunction from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions import ReductionOp @@ -59,30 +60,31 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: - func = call.function.func + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) - if func in ReductionFunctions: - match func: - case ReductionFunctions.WriteBackToPtr: - ptr_expr, symbol_expr = call.args - op = call.function.reduction_op + func = call_func.func + + if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) - ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - # inspired by OpenMP: local reduction variable (negative sign) is added at the end - actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub 
else op - # TODO: can this be avoided somehow? - potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) - if isinstance(potential_call, PsCall): - potential_call.dtype = symbol_expr.dtype - potential_call = self.select_function(potential_call) + # create binop and potentially select corresponding function for e.g. min or max + potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + return self.select_function(potential_call) - return potential_call + return potential_call dtype = call.get_dtype() arg_types = (dtype,) * func.num_args diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 90fd69084..437962172 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -38,7 +38,7 @@ class Platform(ABC): @abstractmethod def select_function( self, call: PsCall - ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsExpression]: + ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 288650698..d5f731653 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,6 +1,6 @@ -from ..ast.structural import PsStatement, PsAssignment, PsBlock +from ..ast.structural import PsAssignment, PsBlock from ..exceptions import MaterializationError -from ..platforms import Platform, CudaPlatform +from ..platforms import Platform from ..ast import PsAstNode from ..ast.expressions import PsCall, PsExpression from ..functions import PsMathFunction, PsReductionFunction @@ -25,13 +25,17 @@ class SelectFunctions: resolved_func = self._platform.select_function(rhs) match resolved_func: - case ((prepend), expr): - match self._platform: - case CudaPlatform(): - # special case: produces statement with atomic operation writing value back to ptr - return PsBlock(prepend + [PsStatement(expr)]) + case (prepend, new_rhs): + assert isinstance(prepend, tuple) + + match new_rhs: + case PsExpression(): + return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),)) + case PsAstNode(): + # special case: produces structural with atomic operation writing value back to ptr + return PsBlock(prepend + (new_rhs,)) case _: - return PsBlock(prepend + [PsAssignment(node.lhs, expr)]) + assert False, "Unexpected output from SelectFunctions." 
case PsExpression(): return PsAssignment(node.lhs, resolved_func) case _: diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 9f04d074a..cc3411249 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -187,16 +187,16 @@ class DefaultKernelCreationDriver: symbol_expr = typify(PsSymbolExpr(symbol)) ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) init_val = typify(reduction_info.init_val) - ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - decl_local_copy = PsDeclaration(symbol_expr, init_val) + ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op), [ptr_symbol_expr, symbol_expr]) - prepend_ast = [decl_local_copy] # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] # declare and init local copy with neutral element append_ast = [PsAssignment(ptr_access, write_back_ptr)] # write back result to reduction target variable - kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast + kernel_ast.statements = prepend_ast + kernel_ast.statements + kernel_ast.statements += append_ast # Target-Specific optimizations if self._target.is_cpu(): -- GitLab From a972d759e7e6e8e8e0c6241a70456184a1366143 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 26 Feb 2025 19:15:45 +0100 Subject: [PATCH 121/180] Fix getter for thread exec condition for dense/sparse iteration spaces in cuda.py --- src/pystencils/backend/platforms/cuda.py | 68 +++++++++++------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 291858810..e67b70db6 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -172,7 +172,7 @@ class Blockwise4DMapping(ThreadMapping): class CudaPlatform(GenericGpu): """Platform for CUDA-based GPUs. - + Args: ctx: The kernel creation context omit_range_check: If `True`, generated index translation code will not check if the point identified @@ -209,6 +209,33 @@ class CudaPlatform(GenericGpu): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") + def _get_condition_for_translation(self, ispace: IterationSpace): + if self._omit_range_check: + return None + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) @@ -341,47 +368,12 @@ class CudaPlatform(GenericGpu): # Internals - # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU? 
- - def _get_condition_for_translation( - self, ispace: IterationSpace): - - if self._omit_range_check: - return None - - match ispace: - case FullIterationSpace(): - - dimensions = ispace.dimensions_in_loop_order() - - conds = [] - for dim in dimensions: - ctr_expr = PsExpression.make(dim.counter) - conds.append(PsLt(ctr_expr, dim.stop)) - - if conds: - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - return condition - else: - return None - - case SparseIterationSpace(): - sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) - stop = PsExpression.make(ispace.index_list.shape[0]) - - return PsLt(sparse_ctr_expr.clone(), stop) - case _: - assert False, "Unknown iteration space" - def _prepend_dense_translation( self, body: PsBlock, ispace: FullIterationSpace ) -> PsBlock: ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] - cond = self._get_condition_for_translation(ispace) dimensions = ispace.dimensions_in_loop_order() @@ -396,6 +388,7 @@ class CudaPlatform(GenericGpu): self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) + cond = self._get_condition_for_translation(ispace) if cond: ast = PsBlock(indexing_decls + [PsConditional(cond, body)]) else: @@ -410,8 +403,6 @@ class CudaPlatform(GenericGpu): factory = AstFactory(self._ctx) ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype()) - cond = self._get_condition_for_translation(ispace) - sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) ctr_mapping = self._thread_mapping(ispace) @@ -434,6 +425,7 @@ class CudaPlatform(GenericGpu): ] body.statements = mappings + body.statements + cond = self._get_condition_for_translation(ispace) if cond: ast = PsBlock([sparse_idx_decl, PsConditional(cond, body)]) else: -- GitLab From 4a031fc192fc3f7d30fde450e8ff1788d3ecc3dd Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 27 Feb 2025 11:49:00 +0100 Subject: [PATCH 122/180] Fix lint --- src/pystencils/backend/platforms/cuda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index e67b70db6..a9ec9d8d6 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -235,7 +235,6 @@ class CudaPlatform(GenericGpu): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) -- GitLab From c7f1518efc14326f9b5732e7e8f6ac7f07acc71a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 16:28:42 +0100 Subject: [PATCH 123/180] Move manual atomic op implementations to new header --- src/pystencils/backend/platforms/cuda.py | 5 +- src/pystencils/include/gpu_atomics.h | 90 +++++++++++++++++++ .../include/pystencils_runtime/hip.h | 89 ------------------ 3 files changed, 94 insertions(+), 90 deletions(-) create mode 100644 src/pystencils/include/gpu_atomics.h diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 7aac0d412..32744661a 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -197,7 +197,10 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return {'"pystencils_runtime/hip.h"'} # TODO: move to HipPlatform once it is 
introduced + return { + '"pystencils_runtime/hip.h"', # TODO: move to HipPlatform once it is introduced + '"gpu_atomics.h' + } def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace diff --git a/src/pystencils/include/gpu_atomics.h b/src/pystencils/include/gpu_atomics.h new file mode 100644 index 000000000..6de5c3321 --- /dev/null +++ b/src/pystencils/include/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/pystencils_runtime/hip.h b/src/pystencils/include/pystencils_runtime/hip.h index b0b4d9679..4bf4917f8 100644 --- a/src/pystencils/include/pystencils_runtime/hip.h +++ b/src/pystencils/include/pystencils_runtime/hip.h @@ -6,92 +6,3 @@ typedef __hip_int8_t int8_t; typedef __hip_uint16_t uint16_t; typedef __hip_int16_t int16_t; #endif - -// No direct implementation for all atomic 
operations available -// -> add support by custom implementations using a CAS mechanism - -#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) - -// - atomicMul (double/float) -// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division -__device__ double atomicMul(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int oldValue = *address_as_ull, assumed; - do { - assumed = oldValue; - oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * - __longlong_as_double(assumed))); - } while (assumed != oldValue); - - return __longlong_as_double(oldValue); -} - -__device__ float atomicMul(float* address, float val) { - int* address_as_int = (int*)address; - int old = *address_as_int; - int assumed; - do { - assumed = old; - old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); - } while (assumed != old); - - return __int_as_float(old); -} - -#endif - -#ifdef __CUDA_ARCH__ - -// - atomicMin (double/float) -// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda -__device__ __forceinline__ double atomicMin(double *address, double val) -{ - unsigned long long ret = __double_as_longlong(*address); - while(val < __longlong_as_double(ret)) - { - unsigned long long old = ret; - if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) - break; - } - return __longlong_as_double(ret); -} - -__device__ __forceinline__ float atomicMin(float *address, float val) -{ - int ret = __float_as_int(*address); - while(val < __int_as_float(ret)) - { - int old = ret; - if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) - break; - } - return __int_as_float(ret); -} - -// - atomicMax (double/float) -// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda -__device__ __forceinline__ double atomicMax(double *address, double val) -{ - unsigned long long ret = __double_as_longlong(*address); - while(val > __longlong_as_double(ret)) - { - unsigned long long old = ret; - if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) - break; - } - return __longlong_as_double(ret); -} - -__device__ __forceinline__ float atomicMax(float *address, float val) -{ - int ret = __float_as_int(*address); - while(val > __int_as_float(ret)) - { - int old = ret; - if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) - break; - } - return __int_as_float(ret); -} - -#endif -- GitLab From 974cf848bfe474a5003e95c907e3e1289b6a5454 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 12 Mar 2025 16:40:25 +0100 Subject: [PATCH 124/180] Fix header incl --- src/pystencils/backend/platforms/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 32744661a..637723f07 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -199,7 +199,7 @@ class CudaPlatform(GenericGpu): def required_headers(self) -> set[str]: return { '"pystencils_runtime/hip.h"', # TODO: move to HipPlatform once it is introduced - '"gpu_atomics.h' + '"gpu_atomics.h"' } def materialize_iteration_space( -- GitLab From 90837d04eaad5b3ad049c11fa5af48cf0942e812 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 
15:36:01 +0100 Subject: [PATCH 125/180] Merge handling for GPU reductions into generic_gpu.py for the time being --- .../backend/platforms/generic_gpu.py | 147 +++++++++++++++--- src/pystencils/codegen/driver.py | 5 + 2 files changed, 130 insertions(+), 22 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 11425d923..f16c28e8c 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,7 +1,16 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from ...types import constify, deconstify +import math +import operator +from abc import ABC, abstractmethod +from functools import reduce + +from ..ast import PsAstNode +from ..constants import PsConstant +from ...compound_op_mapping import compound_op_to_expr +from ...sympyextensions.reduction import ReductionOp +from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType +from ...types.quick import UInt, SInt from ..exceptions import MaterializationError from .platform import Platform @@ -15,7 +24,7 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -23,12 +32,17 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, + PsSymbolExpr, + PsConstantExpr, + PsAdd, + PsRem, + PsEq ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction - +from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \ + PsMathFunction int32 = PsSignedIntegerType(width=32, const=False) @@ -174,10 +188,15 @@ class GenericGpu(Platform): def __init__( self, ctx: KernelCreationContext, + assume_warp_aligned_block_size: bool, + warp_size: int | None, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._warp_size = warp_size + self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -194,14 +213,107 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + @staticmethod + def _get_condition_for_translation(ispace: IterationSpace): + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + call_func = call.function + assert 
isinstance(call_func, PsReductionFunction | PsMathFunction) + + func = call_func.func + + if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + NotImplementedError("atomic operations are only available for float32/64 datatypes") + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match op: + case ReductionOp.Sub: + actual_op = ReductionOp.Add + case _: + actual_op = op + + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + + num_shuffles = math.frexp(self._warp_size)[1] + shuffles = tuple(PsAssignment(symbol_expr, + compound_op_to_expr(actual_op, + symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) + for i in reversed(range(1, num_shuffles))) + + # find first thread in warp + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + thread_indices_per_dim = [ + idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) + for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + ] + tid: PsExpression = thread_indices_per_dim[0] + for t in thread_indices_per_dim[1:]: + tid = PsAdd(tid, t) + first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32)))) + cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + + # use atomic operation on first thread of warp to sync + call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return shuffles, PsConditional(cond, PsBlock([PsStatement(call)])) - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsIeeeFloatType): + if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions: + assert isinstance(dtype, PsIeeeFloatType) + + match func: + case NumericLimitsFunctions.Min: + define = "NEG_INFINITY" + case NumericLimitsFunctions.Max: + define = "POS_INFINITY" + case _: + raise MaterializationError(f"Cannot materialize call to function {func}") + + return PsLiteralExpr(PsLiteral(define, dtype)) + + if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions: match func: case ( MathFunctions.Exp @@ -262,7 +374,6 @@ class GenericGpu(Platform): ctr_mapping = self._thread_mapping(ispace) indexing_decls = [] - conds = [] dimensions = ispace.dimensions_in_loop_order() @@ -276,14 +387,9 @@ class GenericGpu(Platform): indexing_decls.append( self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter])) ) - conds.append(PsLt(ctr_expr, dim.stop)) - - condition: PsExpression = conds[0] - for cond in conds[1:]: - condition = PsAnd(condition, cond) - ast = PsBlock(indexing_decls + [PsConditional(condition, body)]) - return ast + cond = 
self._get_condition_for_translation(ispace) + return PsBlock(indexing_decls + [PsConditional(cond, body)]) def _prepend_sparse_translation( self, body: PsBlock, ispace: SparseIterationSpace @@ -313,8 +419,5 @@ class GenericGpu(Platform): ] body.statements = mappings + body.statements - stop = PsExpression.make(ispace.index_list.shape[0]) - condition = PsLt(sparse_ctr_expr.clone(), stop) - ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)]) - - return ast + cond = self._get_condition_for_translation(ispace) + return PsBlock([sparse_idx_decl, PsConditional(cond, body)]) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index c2bee0ad2..3962c316b 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -475,6 +475,9 @@ class DefaultKernelCreationDriver: else None ) + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + warp_size: int | None = self._cfg.gpu.get_option("warp_size") + GpuPlatform: type match self._target: case Target.CUDA: @@ -486,6 +489,8 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, + assume_warp_aligned_block_size, + warp_size, thread_mapping=thread_mapping, ) -- GitLab From 9ec813bdd1c8dfd2a1f32bb300d6f9e7d172542f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 16:07:01 +0100 Subject: [PATCH 126/180] Employ optimized warp-level reduction based on check --- .../backend/platforms/generic_gpu.py | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index f16c28e8c..d3e8de42d 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -264,33 +264,45 @@ class GenericGpu(Platform): case _: actual_op = op - # perform local warp reductions - def gen_shuffle_instr(offset: int): - full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) - - num_shuffles = math.frexp(self._warp_size)[1] - shuffles = tuple(PsAssignment(symbol_expr, - compound_op_to_expr(actual_op, - symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) - for i in reversed(range(1, num_shuffles))) - - # find first thread in warp + # check if thread is valid for performing reduction ispace = self._ctx.get_iteration_space() is_valid_thread = self._get_condition_for_translation(ispace) - thread_indices_per_dim = [ - idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) - for i, idx in enumerate(THREAD_IDX[:ispace.rank]) - ] - tid: PsExpression = thread_indices_per_dim[0] - for t in thread_indices_per_dim[1:]: - tid = PsAdd(tid, t) - first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32)))) - cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp - - # use atomic operation on first thread of warp to sync + + cond: PsExpression + shuffles: tuple[PsAssignment, ...] 
+ if self._warp_size and self._assume_warp_aligned_block_size: + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + + # set up shuffle instructions for warp-level reduction + num_shuffles = math.frexp(self._warp_size)[1] + shuffles = tuple(PsAssignment(symbol_expr, + compound_op_to_expr(actual_op, + symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) + for i in reversed(range(1, num_shuffles))) + + # find first thread in warp + thread_indices_per_dim = [ + idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) + for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + ] + tid: PsExpression = thread_indices_per_dim[0] + for t in thread_indices_per_dim[1:]: + tid = PsAdd(tid, t) + first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32)))) + + # set condition to only execute atomic operation on first valid thread in warp + cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + else: + # no optimization: only execute atomic add on valid thread + shuffles = () + cond = is_valid_thread + + # use atomic operation call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) call.args = (ptr_expr, symbol_expr) -- GitLab From d7e6890cff8f327ce285a3360821a4341b2acc46 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 16:23:54 +0100 Subject: [PATCH 127/180] Fix typecheck --- src/pystencils/backend/platforms/generic_cpu.py | 4 ++-- src/pystencils/backend/platforms/generic_gpu.py | 4 ++-- src/pystencils/backend/platforms/platform.py | 4 ++-- src/pystencils/backend/platforms/sycl.py | 4 ++-- .../backend/transformations/loop_vectorizer.py | 14 +++++++------- .../backend/transformations/select_functions.py | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 2f873ff29..43b048184 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -22,7 +22,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -60,7 +60,7 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index d3e8de42d..9b21457be 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -24,7 +24,7 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration, 
PsStatement, PsAssignment +from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment, PsStructuralNode from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -238,7 +238,7 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 437962172..4f738dd5d 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -1,7 +1,7 @@ from abc import ABC, abstractmethod from ..ast import PsAstNode -from ..ast.structural import PsBlock +from ..ast.structural import PsBlock, PsStructuralNode from ..ast.expressions import PsCall, PsExpression from ..kernelcreation.context import KernelCreationContext @@ -38,7 +38,7 @@ class Platform(ABC): @abstractmethod def select_function( self, call: PsCall - ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]: + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 7d7b8d1a7..78af01b2f 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -7,7 +7,7 @@ from ..kernelcreation.iteration_space import ( FullIterationSpace, SparseIterationSpace, ) -from ..ast.structural import PsDeclaration, PsBlock, PsConditional +from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode from ..ast.expressions import ( PsExpression, PsSymbolExpr, @@ -56,7 +56,7 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]: + def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index b78114553..a96c6af4b 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,7 +7,7 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment +from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment, PsStructuralNode from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols @@ -135,20 +135,20 @@ class LoopVectorizer: vc = VectorizationContext(self._ctx, self._lanes, axis) # Prepare reductions - simd_init_local_reduction_vars = [] - simd_writeback_local_reduction_vars = [] + simd_init_local_reduction_vars: 
list[PsStructuralNode] = [] + simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] for symb, reduction_info in self._ctx.symbols_reduction_info.items(): # Vectorize symbol for local copy vector_symb = vc.vectorize_symbol(symb) # Declare and init vector - simd_init_local_reduction_vars += [self._type_fold(PsDeclaration( - PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb))))] + simd_init_local_reduction_vars += [PsDeclaration( + PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb)))] # Write back vectorization result - simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment( + simd_writeback_local_reduction_vars += [PsAssignment( PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb), - reduction_info.op)))] + reduction_info.op))] # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index d5f731653..576cebad1 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,4 +1,4 @@ -from ..ast.structural import PsAssignment, PsBlock +from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode from ..exceptions import MaterializationError from ..platforms import Platform from ..ast import PsAstNode @@ -31,7 +31,7 @@ class SelectFunctions: match new_rhs: case PsExpression(): return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),)) - case PsAstNode(): + case PsStructuralNode(): # special case: produces structural with atomic operation writing value back to ptr return PsBlock(prepend + (new_rhs,)) case _: -- GitLab From 5ee715d0df2651e745ff5de0524abfe24d48c968 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 17:04:47 +0100 Subject: [PATCH 128/180] Reformat adapted files [skip ci] --- src/pystencils/__init__.py | 9 +- src/pystencils/backend/ast/vector.py | 46 +++++---- src/pystencils/backend/emission/ir_printer.py | 4 +- .../backend/kernelcreation/context.py | 6 +- .../backend/kernelcreation/freeze.py | 18 ++-- .../backend/kernelcreation/typification.py | 7 +- .../backend/platforms/generic_cpu.py | 46 +++++++-- .../backend/platforms/generic_gpu.py | 98 ++++++++++++++----- src/pystencils/backend/platforms/platform.py | 2 +- src/pystencils/backend/platforms/sycl.py | 4 +- src/pystencils/backend/platforms/x86.py | 10 +- .../backend/transformations/add_pragmas.py | 8 +- .../transformations/loop_vectorizer.py | 36 ++++--- .../transformations/select_functions.py | 8 +- .../transformations/select_intrinsics.py | 4 +- src/pystencils/codegen/driver.py | 38 +++++-- src/pystencils/compound_op_mapping.py | 15 ++- src/pystencils/jit/cpu_extension_module.py | 5 +- src/pystencils/sympyextensions/__init__.py | 4 +- src/pystencils/sympyextensions/reduction.py | 11 ++- tests/kernelcreation/test_reduction.py | 23 +++-- 21 files changed, 283 insertions(+), 119 deletions(-) diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py index a7bf33aa6..329f61d32 100644 --- a/src/pystencils/__init__.py +++ b/src/pystencils/__init__.py @@ -1,10 +1,6 @@ """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" -from .codegen import ( - Target, - CreateKernelConfig, - AUTO -) +from .codegen import Target, CreateKernelConfig, AUTO from .defaults import DEFAULTS from . 
import fd from . import stencil as stencil @@ -93,4 +89,5 @@ __all__ = [ ] from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 4f5224133..4141b0296 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -18,7 +18,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): """Broadcast a scalar value to N vector lanes.""" __match_args__ = ("lanes", "operand") - + def __init__(self, lanes: int, operand: PsExpression): super().__init__(operand) self._lanes = lanes @@ -26,21 +26,18 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): @property def lanes(self) -> int: return self._lanes - + @lanes.setter def lanes(self, n: int): self._lanes = n def _clone_expr(self) -> PsVecBroadcast: return PsVecBroadcast(self._lanes, self._operand.clone()) - + def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecBroadcast): return False - return ( - super().structurally_equal(other) - and self._lanes == other._lanes - ) + return super().structurally_equal(other) and self._lanes == other._lanes class PsVecHorizontal(PsBinOp, PsVectorOp): @@ -48,8 +45,13 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op") - def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression, - reduction_op: ReductionOp): + def __init__( + self, + lanes: int, + scalar_operand: PsExpression, + vector_operand: PsExpression, + reduction_op: ReductionOp, + ): super().__init__(scalar_operand, vector_operand) self._lanes = lanes self._reduction_op = reduction_op @@ -87,19 +89,23 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): self._reduction_op = op def _clone_expr(self) -> PsVecHorizontal: - return PsVecHorizontal(self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op) + return PsVecHorizontal( + self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op + ) def structurally_equal(self, other: PsAstNode) -> bool: if not isinstance(other, PsVecHorizontal): return False - return (super().structurally_equal(other) - and self._lanes == other._lanes - and self._reduction_op == other._reduction_op) + return ( + super().structurally_equal(other) + and self._lanes == other._lanes + and self._reduction_op == other._reduction_op + ) class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): """Pointer-based vectorized memory access. 
- + Args: base_ptr: Pointer identifying the accessed memory region offset: Offset inside the memory region @@ -150,7 +156,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): @property def stride(self) -> PsExpression | None: return self._stride - + @stride.setter def stride(self, expr: PsExpression | None): self._stride = expr @@ -161,10 +167,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): def get_vector_type(self) -> PsVectorType: return cast(PsVectorType, self._dtype) - + def get_children(self) -> tuple[PsAstNode, ...]: - return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,)) - + return (self._ptr, self._offset) + ( + () if self._stride is None else (self._stride,) + ) + def set_child(self, idx: int, c: PsAstNode): idx = [0, 1, 2][idx] match idx: @@ -193,7 +201,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp): and self._vector_entries == other._vector_entries and self._aligned == other._aligned ) - + def __repr__(self) -> str: return ( f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, " diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index 1508e6d94..22ae2f91a 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel): class IRAstPrinter(BasePrinter): """Print the IR AST as pseudo-code. - + This printer produces a complete pseudocode representation of a pystencils AST. Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each node defined in `ast <pystencils.backend.ast>`. @@ -85,7 +85,7 @@ class IRAstPrinter(BasePrinter): return pc.parenthesize( f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})", - Ops.Weakest + Ops.Weakest, ) case _: diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 827be45a5..536c73c7f 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -106,7 +106,7 @@ class KernelCreationContext: def index_dtype(self) -> PsIntegerType: """Data type used by default for index expressions""" return self._index_dtype - + def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType: """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are.""" match dtype: @@ -191,7 +191,9 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_symbol_reduction_info(self, local_symb: PsSymbol, reduction_info: ReductionInfo): + def add_symbol_reduction_info( + self, local_symb: PsSymbol, reduction_info: ReductionInfo + ): """Adds entry for a symbol and its reduction info to its corresponding lookup table. The symbol ``symbol`` shall not exist in the symbol table already. 
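
An illustrative aside on the vectorized reduction path touched above (PsVecBroadcast to seed lane-local copies, PsVecHorizontal to fold them back into the scalar): the following minimal NumPy sketch is not part of the patch series and only models the behaviour of the generated code, assuming four lanes and an add-reduction.

    import numpy as np

    lanes = 4
    w_local = 0.0                        # scalar local copy, neutral element of "+"
    w_vec = np.full(lanes, w_local)      # PsVecBroadcast: every lane starts from the local copy

    x = np.arange(16, dtype=np.float64)  # data the kernel iterates over
    for i in range(0, x.size, lanes):    # vectorized main loop
        w_vec += x[i:i + lanes]          # lane-wise accumulation

    w_local += w_vec.sum()               # PsVecHorizontal: fold the lanes back into the scalar copy

    w_target = np.zeros(1)               # reduction target passed to the kernel
    w_target[0] += w_local               # write-back step appended by the driver
    assert w_target[0] == x.sum()

The same three stages — neutral-element initialization, lane-wise accumulation, horizontal fold plus write-back — are what the declarations and assignments built in the loop vectorizer and driver changes below correspond to.
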
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index df6bfbd1f..63e9ea5b1 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -57,7 +57,7 @@ from ..ast.expressions import ( PsAnd, PsOr, PsNot, - PsMemAcc + PsMemAcc, ) from ..ast.vector import PsVecMemAcc @@ -110,7 +110,9 @@ class FreezeExpressions: def __call__(self, obj: AssignmentCollection | sp.Basic) -> PsAstNode: if isinstance(obj, AssignmentCollection): - return PsBlock([cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments]) + return PsBlock( + [cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments] + ) elif isinstance(obj, AssignmentBase): return cast(PsAssignment, self.visit(obj)) elif isinstance(obj, _ExprLike): @@ -179,7 +181,9 @@ class FreezeExpressions: "/=": ReductionOp.Div, } - return PsAssignment(lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs)) + return PsAssignment( + lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs) + ) def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) @@ -327,22 +331,22 @@ class FreezeExpressions: raise FreezeError("Cannot translate an empty tuple.") items = [self.visit_expr(item) for item in expr] - + if any(isinstance(i, PsArrayInitList) for i in items): # base case: have nested arrays if not all(isinstance(i, PsArrayInitList) for i in items): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + subarrays = cast(list[PsArrayInitList], items) shape_tail = subarrays[0].shape - + if not all(s.shape == shape_tail for s in subarrays[1:]): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + return PsArrayInitList([s.items_grid for s in subarrays]) # type: ignore else: # base case: no nested arrays diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 3ca0a16e2..b457f39a0 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -194,9 +194,10 @@ class TypeContext: f" Target type: {self._target_type}" ) - case PsNumericOpTrait() if not isinstance( - self._target_type, PsNumericType - ) or self._target_type.is_bool(): + case PsNumericOpTrait() if ( + not isinstance(self._target_type, PsNumericType) + or self._target_type.is_bool() + ): # FIXME: PsBoolType derives from PsNumericType, but is not numeric raise TypificationError( f"Numerical operation encountered in non-numerical type context:\n" diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 43b048184..ccef61817 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -4,8 +4,14 @@ from typing import Sequence from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr from ..ast import PsAstNode -from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, PsMathFunction, \ - PsReductionFunction +from ..functions import ( + CFunction, + MathFunctions, + NumericLimitsFunctions, + ReductionFunctions, + PsMathFunction, + PsReductionFunction, +) from ..literals import PsLiteral from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions import ReductionOp @@ -30,7 +36,8 @@ from ..ast.expressions import ( 
PsLookup, PsGe, PsLe, - PsTernary, PsLiteralExpr, + PsTernary, + PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -60,20 +67,31 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) func = call_func.func - if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): ptr_expr, symbol_expr = call.args op = call_func.reduction_op - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) - ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) # inspired by OpenMP: local reduction variable (negative sign) is added at the end actual_op = ReductionOp.Add if op is ReductionOp.Sub else op @@ -89,8 +107,16 @@ class GenericCpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max): - return PsLiteralExpr(PsLiteral(f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype)) + if isinstance(dtype, PsScalarType) and func in ( + NumericLimitsFunctions.Min, + NumericLimitsFunctions.Max, + ): + return PsLiteralExpr( + PsLiteral( + f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", + dtype, + ) + ) if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 9b21457be..2a12d6b7b 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -24,7 +24,14 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment, PsStructuralNode +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStatement, + PsAssignment, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -36,13 +43,19 @@ from ..ast.expressions import ( PsConstantExpr, PsAdd, PsRem, - PsEq + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \ - PsMathFunction +from ..functions import ( + MathFunctions, + CFunction, + ReductionFunctions, + NumericLimitsFunctions, + PsReductionFunction, + PsMathFunction, +) int32 = PsSignedIntegerType(width=32, const=False) @@ -131,7 +144,7 @@ class Blockwise4DMapping(ThreadMapping): 
THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -177,7 +190,7 @@ class Blockwise4DMapping(ThreadMapping): class GenericGpu(Platform): """Common base platform for CUDA- and HIP-type GPU targets. - + Args: ctx: The kernel creation context omit_range_check: If `True`, generated index translation code will not check if the point identified @@ -238,23 +251,34 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: call_func = call.function assert isinstance(call_func, PsReductionFunction | PsMathFunction) func = call_func.func - if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr: + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): ptr_expr, symbol_expr = call.args op = call_func.reduction_op stype = symbol_expr.dtype ptrtype = ptr_expr.dtype - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptrtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + stype, PsScalarType + ) if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): - NotImplementedError("atomic operations are only available for float32/64 datatypes") + NotImplementedError( + "atomic operations are only available for float32/64 datatypes" + ) # workaround for subtractions -> use additions for reducing intermediate results # similar to OpenMP reductions: local copies (negative sign) are added at the end @@ -274,36 +298,60 @@ class GenericGpu(Platform): # perform local warp reductions def gen_shuffle_instr(offset: int): full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), - [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))]) + return PsCall( + CFunction( + "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype + ), + [ + full_mask, + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32))), + ], + ) # set up shuffle instructions for warp-level reduction num_shuffles = math.frexp(self._warp_size)[1] - shuffles = tuple(PsAssignment(symbol_expr, - compound_op_to_expr(actual_op, - symbol_expr, gen_shuffle_instr(pow(2, i - 1)))) - for i in reversed(range(1, num_shuffles))) + shuffles = tuple( + PsAssignment( + symbol_expr, + compound_op_to_expr( + actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)) + ), + ) + for i in reversed(range(1, num_shuffles)) + ) # find first thread in warp thread_indices_per_dim = [ - idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))) - for i, idx in enumerate(THREAD_IDX[:ispace.rank]) + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) ] tid: PsExpression = thread_indices_per_dim[0] for t in thread_indices_per_dim[1:]: tid = PsAdd(tid, t) - first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32)))) + 
first_thread_in_warp = PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) # set condition to only execute atomic operation on first valid thread in warp - cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp + cond = ( + PsAnd(is_valid_thread, first_thread_in_warp) + if is_valid_thread + else first_thread_in_warp + ) else: # no optimization: only execute atomic add on valid thread shuffles = () cond = is_valid_thread # use atomic operation - call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")) + call.function = CFunction( + f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void") + ) call.args = (ptr_expr, symbol_expr) # assemble warp reduction @@ -321,7 +369,9 @@ class GenericGpu(Platform): case NumericLimitsFunctions.Max: define = "POS_INFINITY" case _: - raise MaterializationError(f"Cannot materialize call to function {func}") + raise MaterializationError( + f"Cannot materialize call to function {func}" + ) return PsLiteralExpr(PsLiteral(define, dtype)) diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py index 4f738dd5d..7b81865ae 100644 --- a/src/pystencils/backend/platforms/platform.py +++ b/src/pystencils/backend/platforms/platform.py @@ -12,7 +12,7 @@ class Platform(ABC): """Abstract base class for all supported platforms. The platform performs all target-dependent tasks during code generation: - + - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) - Platform-specific optimizations (e.g. vectorization, OpenMP) """ diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index 78af01b2f..22d60f9b0 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -56,7 +56,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index df0945006..add38cfe4 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -359,7 +359,11 @@ def _x86_op_intrin( atype = vtype.scalar_type case PsVecHorizontal(): # horizontal add instead of sub avoids double inversion of sign - actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op + actual_op = ( + ReductionOp.Add + if op.reduction_op == ReductionOp.Sub + else op.reduction_op + ) opstr = f"horizontal_{actual_op.name.lower()}" rtype = vtype.scalar_type atypes = (vtype.scalar_type, vtype) @@ -409,7 +413,9 @@ def _x86_op_intrin( case (SInt(64), Fp()) | ( Fp(), SInt(64), - ) if varch < X86VectorArch.AVX512: + ) if ( + varch < X86VectorArch.AVX512 + ): panic() # AVX512 only: cvtepiA_epiT if A > T case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512: diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index c9e8b3994..fa466e495 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ 
b/src/pystencils/backend/transformations/add_pragmas.py @@ -126,9 +126,13 @@ class AddOpenMP: if bool(ctx.symbols_reduction_info): for symbol, reduction_info in ctx.symbols_reduction_info.items(): if isinstance(symbol.dtype, PsScalarType): - pragma_text += f" reduction({reduction_info.op.value}: {symbol.name})" + pragma_text += ( + f" reduction({reduction_info.op.value}: {symbol.name})" + ) else: - NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.") + NotImplementedError( + "OMP: Reductions for non-scalar data types are not supported yet." + ) if collapse is not None: if collapse <= 0: diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index a96c6af4b..04d7d20f0 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,7 +7,13 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment, PsStructuralNode +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols @@ -142,13 +148,25 @@ class LoopVectorizer: vector_symb = vc.vectorize_symbol(symb) # Declare and init vector - simd_init_local_reduction_vars += [PsDeclaration( - PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb)))] + simd_init_local_reduction_vars += [ + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + ) + ] # Write back vectorization result - simd_writeback_local_reduction_vars += [PsAssignment( - PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb), - reduction_info.op))] + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(symb), + PsVecHorizontal( + self._lanes, + PsSymbolExpr(symb), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -241,11 +259,7 @@ class LoopVectorizer: return PsBlock( simd_init_local_reduction_vars - + [ - simd_stop_decl, - simd_step_decl, - simd_loop - ] + + [simd_stop_decl, simd_step_decl, simd_loop] + simd_writeback_local_reduction_vars + [ trailing_start_decl, diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index 576cebad1..9ce404693 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -21,7 +21,9 @@ class SelectFunctions: if isinstance(node, PsAssignment): rhs = node.rhs - if isinstance(rhs, PsCall) and isinstance(rhs.function, PsReductionFunction): + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionFunction + ): resolved_func = self._platform.select_function(rhs) match resolved_func: @@ -30,7 +32,9 @@ class SelectFunctions: match new_rhs: case PsExpression(): - return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),)) + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) case PsStructuralNode(): # special case: produces structural with atomic operation writing value 
back to ptr return PsBlock(prepend + (new_rhs,)) diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 49fb9bb08..b20614393 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -101,7 +101,9 @@ class SelectIntrinsics: if isinstance(expr, PsVecHorizontal): scalar_op = expr.scalar_operand vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) - return self._platform.op_intrinsic(expr, [scalar_op, vector_op_to_scalar]) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) else: return expr diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 3962c316b..c285dd7bf 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -26,7 +26,13 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode from ..backend.functions import PsReductionFunction, ReductionFunctions -from ..backend.ast.expressions import PsExpression, PsSymbolExpr, PsCall, PsMemAcc, PsConstantExpr +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + PsCall, + PsMemAcc, + PsConstantExpr, +) from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( @@ -191,12 +197,20 @@ class DefaultKernelCreationDriver: ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) init_val = typify(reduction_info.init_val) - ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))) - write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op), - [ptr_symbol_expr, symbol_expr]) + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionFunction( + ReductionFunctions.WriteBackToPtr, reduction_info.op + ), + [ptr_symbol_expr, symbol_expr], + ) - prepend_ast = [PsDeclaration(symbol_expr, init_val)] # declare and init local copy with neutral element - append_ast = [PsAssignment(ptr_access, write_back_ptr)] # write back result to reduction target variable + # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast = [PsAssignment(ptr_access, write_back_ptr)] kernel_ast.statements = prepend_ast + kernel_ast.statements kernel_ast.statements += append_ast @@ -423,14 +437,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." 
+ ) return GpuIndexing( self._ctx, @@ -475,7 +493,9 @@ class DefaultKernelCreationDriver: else None ) - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") GpuPlatform: type diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py index f256369f9..193b308d0 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/compound_op_mapping.py @@ -3,7 +3,12 @@ from .backend.exceptions import FreezeError from .backend.functions import PsMathFunction, MathFunctions from .sympyextensions.reduction import ReductionOp -_available_operator_interface: set[ReductionOp] = {ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Div} +_available_operator_interface: set[ReductionOp] = { + ReductionOp.Add, + ReductionOp.Sub, + ReductionOp.Mul, + ReductionOp.Div, +} def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: @@ -18,7 +23,9 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: case ReductionOp.Div: operator = PsDiv case _: - raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") + raise FreezeError( + f"Found unsupported operation type for compound assignments: {op}." + ) return operator(op1, op2) else: match op: @@ -27,4 +34,6 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: case ReductionOp.Max: return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) case _: - raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.") + raise FreezeError( + f"Found unsupported operation type for compound assignments: {op}." 
+ ) diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py index 03260f649..4d76ea9ca 100644 --- a/src/pystencils/jit/cpu_extension_module.py +++ b/src/pystencils/jit/cpu_extension_module.py @@ -92,6 +92,7 @@ class PsKernelExtensioNModule: # Kernels and call wrappers from ..backend.emission import CAstPrinter + printer = CAstPrinter(func_prefix="FUNC_PREFIX") for name, kernel in self._kernels.items(): @@ -293,7 +294,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ self._buffer_types[ptr] = ptr_dtype.base_type self.extract_buffer(ptr, param.name) buffer = self.get_buffer(param.name) - code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) assert code is not None diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index 71f9a049a..bd0fa1fe9 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -28,7 +28,7 @@ from .math import ( count_operations_in_ast, common_denominator, get_symmetric_part, - SymbolCreator + SymbolCreator, ) @@ -67,5 +67,5 @@ __all__ = [ "common_denominator", "get_symmetric_part", "SymbolCreator", - "DynamicType" + "DynamicType", ] diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index cebfcb2f7..e95e37c24 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -22,6 +22,7 @@ class ReductionAssignment(AssignmentBase): reduction_op : ReductionOp Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc. """ + _reduction_op = None # type: ReductionOp @property @@ -55,9 +56,13 @@ class MaxReductionAssignment(ReductionAssignment): # Mapping from ReductionOp enum to ReductionAssigment classes _reduction_assignment_classes = { - cls.reduction_op: cls for cls in [ - AddReductionAssignment, SubReductionAssignment, MulReductionAssignment, - MinReductionAssignment, MaxReductionAssignment + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, ] } diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index c3775964b..6e2b2f3fe 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -10,7 +10,7 @@ SIZE = 15 SOLUTION = { "+": INIT_W + INIT_ARR * SIZE, "-": INIT_W - INIT_ARR * SIZE, - "*": INIT_W * INIT_ARR ** SIZE, + "*": INIT_W * INIT_ARR**SIZE, "min": min(INIT_W, INIT_ARR), "max": max(INIT_W, INIT_ARR), } @@ -18,7 +18,7 @@ SOLUTION = { # get AST for kernel with reduction assignment def get_reduction_assign_ast(dtype, op, config): - x = ps.fields(f'x: {dtype}[1d]') + x = ps.fields(f"x: {dtype}[1d]") w = ps.TypedSymbol("w", dtype) red_assign = reduction_assignment_from_str(w, op, x.center()) @@ -26,13 +26,18 @@ def get_reduction_assign_ast(dtype, op, config): return ps.create_kernel([red_assign], config, default_dtype=dtype) -@pytest.mark.parametrize('instruction_set', ['sse', 'avx']) -@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_cpu(instruction_set, dtype, op): - vectorize_info 
= {'instruction_set': instruction_set, 'assume_inner_stride_one': True} + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } - config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info) + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) ast_reduction = get_reduction_assign_ast(dtype, op, config) ps.show_code(ast_reduction) @@ -45,7 +50,7 @@ def test_reduction_cpu(instruction_set, dtype, op): assert np.allclose(reduction_array, SOLUTION[op]) -@pytest.mark.parametrize('dtype', ["float64", "float32"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) def test_reduction_gpu(dtype, op): try: @@ -57,7 +62,9 @@ def test_reduction_gpu(dtype, op): except ImportError: pytest.skip(reason="CuPy is not available", allow_module_level=True) except CUDARuntimeError: - pytest.skip(reason="No CUDA capable device is detected", allow_module_level=True) + pytest.skip( + reason="No CUDA capable device is detected", allow_module_level=True + ) config = ps.CreateKernelConfig(target=ps.Target.GPU) -- GitLab From 16a6e80d32e7ff12e151fd75ec6f2905903620df Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 17:11:13 +0100 Subject: [PATCH 129/180] Add missing type fold for loop vectorizer again --- src/pystencils/backend/transformations/loop_vectorizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index 04d7d20f0..48b9ad0da 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -149,9 +149,11 @@ class LoopVectorizer: # Declare and init vector simd_init_local_reduction_vars += [ - PsDeclaration( - PsSymbolExpr(vector_symb), - PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + self._type_fold( + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + ) ) ] -- GitLab From 2c507d796f0c5abbff32386f4268d3bdb988c6fa Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 18:18:10 +0100 Subject: [PATCH 130/180] Fix required headers for cuda/hip platforms --- src/pystencils/backend/platforms/cuda.py | 4 +++- src/pystencils/backend/platforms/hip.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 98ff3e3d3..c05c45f04 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -8,4 +8,6 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return set() + return { + '"gpu_atomics.h"', + } diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index c758995a0..65d844bbb 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -8,4 +8,7 @@ class HipPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return {'"pystencils_runtime/hip.h"'} + return { + '"gpu_atomics.h"', + '"pystencils_runtime/hip.h"', + } -- GitLab From 6a6e57f08bec708036387c78919f7d4e028d86d4 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 18:19:03 +0100 Subject: [PATCH 
131/180] Fix wrong rank being used for obtaining default block sizes --- src/pystencils/codegen/gpu_indexing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 43b612bd7..f5606da02 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + rank: int, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool, @@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + default_bs = GpuLaunchConfiguration.get_default_block_size(rank) self._default_block_size = default_bs self._init_block_size: dim3 = default_bs self._compute_block_size: ( @@ -598,6 +599,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( + rank, num_work_items, self._hw_props, self._assume_warp_aligned_block_size, -- GitLab From c31d407433bb075a42691866e87e59208bf99d90 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 20 Mar 2025 18:56:41 +0100 Subject: [PATCH 132/180] Try fixing required headers for cuda and hip for reductions --- src/pystencils/backend/platforms/cuda.py | 4 +--- src/pystencils/backend/platforms/generic_gpu.py | 12 ++++++++++-- src/pystencils/backend/platforms/hip.py | 3 +-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index c05c45f04..bbb608f5c 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -8,6 +8,4 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return { - '"gpu_atomics.h"', - } + return super().required_headers diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 2a12d6b7b..4f97264b0 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -198,6 +198,14 @@ class GenericGpu(Platform): thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ + @property + @abstractmethod + def required_headers(self) -> set[str]: + return { + '"gpu_atomics.h"', + "<cmath>", + } + def __init__( self, ctx: KernelCreationContext, @@ -365,9 +373,9 @@ class GenericGpu(Platform): match func: case NumericLimitsFunctions.Min: - define = "NEG_INFINITY" + define = "-INFINITY" case NumericLimitsFunctions.Max: - define = "POS_INFINITY" + define = "INFINITY" case _: raise MaterializationError( f"Cannot materialize call to function {func}" diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index 65d844bbb..c5e8b3882 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -8,7 +8,6 @@ class HipPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return { - '"gpu_atomics.h"', + return super().required_headers | { '"pystencils_runtime/hip.h"', } -- GitLab From f77909540a9d26ce9dc1decde5c9508bc1f2d14a Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Mar 2025 16:22:40 +0100 Subject: [PATCH 133/180] Use NPP 
library for numeric limits for CUDA, use std limits for HIP --- src/pystencils/backend/platforms/cuda.py | 25 +++++++++++++++++-- .../backend/platforms/generic_gpu.py | 21 +++++----------- src/pystencils/backend/platforms/hip.py | 17 ++++++++++++- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index bbb608f5c..b5b3478e4 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,32 @@ from __future__ import annotations from .generic_gpu import GenericGpu +from ..ast.expressions import PsExpression, PsLiteralExpr +from ..functions import PsFunction, NumericLimitsFunctions +from ..literals import PsLiteral +from ...types import PsType, PsIeeeFloatType class CudaPlatform(GenericGpu): - """Platform for the CUDA GPU taret.""" + """Platform for the CUDA GPU target.""" @property def required_headers(self) -> set[str]: - return super().required_headers + return super().required_headers | { + '"npp.h"', + } + + def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + assert isinstance(dtype, PsIeeeFloatType) + + match func: + case NumericLimitsFunctions.Min: + define = f"NPP_MINABS_{dtype.width}F" + case NumericLimitsFunctions.Max: + define = f"NPP_MAXABS_{dtype.width}F" + case _: + raise MaterializationError( + f"Cannot materialize call to function {func}" + ) + + return PsLiteralExpr(PsLiteral(define, dtype)) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 4f97264b0..787b390fe 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -9,7 +9,7 @@ from ..ast import PsAstNode from ..constants import PsConstant from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions.reduction import ReductionOp -from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType +from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType from ...types.quick import UInt, SInt from ..exceptions import MaterializationError from .platform import Platform @@ -203,9 +203,12 @@ class GenericGpu(Platform): def required_headers(self) -> set[str]: return { '"gpu_atomics.h"', - "<cmath>", } + @abstractmethod + def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + pass + def __init__( self, ctx: KernelCreationContext, @@ -369,19 +372,7 @@ class GenericGpu(Platform): arg_types = (dtype,) * func.num_args if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions: - assert isinstance(dtype, PsIeeeFloatType) - - match func: - case NumericLimitsFunctions.Min: - define = "-INFINITY" - case NumericLimitsFunctions.Max: - define = "INFINITY" - case _: - raise MaterializationError( - f"Cannot materialize call to function {func}" - ) - - return PsLiteralExpr(PsLiteral(define, dtype)) + return self.resolve_numeric_limits(func, dtype) if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions: match func: diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index c5e8b3882..60e249aeb 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -1,13 +1,28 @@ from __future__ import annotations from .generic_gpu import GenericGpu +from ..ast.expressions import PsExpression, PsLiteralExpr +from ..functions import PsMathFunction 
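
A brief aside on why these limit literals exist at all: a min/max reduction needs an initial value that loses against every real datum, i.e. the largest (for min) or lowest (for max) representable number, while additive and multiplicative reductions start from 0 and 1. The sketch below is not part of the patch and only illustrates the role of the neutral element with NumPy; the generated kernels obtain the actual limits from the platform-specific literals introduced here (NPP macros on CUDA, std::numeric_limits on HIP).

    import numpy as np

    # Neutral elements per reduction operation, shown for float64.
    neutral = {
        "+": 0.0,                          # x + 0 == x
        "-": 0.0,                          # local copies are combined additively
        "*": 1.0,                          # x * 1 == x
        "min": np.finfo(np.float64).max,   # every datum is <= this
        "max": np.finfo(np.float64).min,   # every datum is >= this
    }

    data = np.array([3.0, -7.5, 2.25])
    acc = neutral["min"]
    for v in data:
        acc = min(acc, v)                  # the first real datum always replaces the initial value
    assert acc == data.min()
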
+from ..literals import PsLiteral +from ...types import PsType, PsIeeeFloatType class HipPlatform(GenericGpu): - """Platform for the HIP GPU taret.""" + """Platform for the HIP GPU target.""" @property def required_headers(self) -> set[str]: return super().required_headers | { '"pystencils_runtime/hip.h"', + "<limits>" } + + def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + assert isinstance(dtype, PsIeeeFloatType) + + return PsLiteralExpr( + PsLiteral( + f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", + dtype, + ) + ) -- GitLab From 4e5c89b9cd23610f27d61612b14a12e968724e3f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 24 Mar 2025 16:54:50 +0100 Subject: [PATCH 134/180] Fix lint, typecheck --- src/pystencils/backend/platforms/cuda.py | 5 +++-- src/pystencils/backend/platforms/generic_gpu.py | 4 ++-- src/pystencils/backend/platforms/hip.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index b5b3478e4..7a5074677 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -2,7 +2,8 @@ from __future__ import annotations from .generic_gpu import GenericGpu from ..ast.expressions import PsExpression, PsLiteralExpr -from ..functions import PsFunction, NumericLimitsFunctions +from ..exceptions import MaterializationError +from ..functions import NumericLimitsFunctions from ..literals import PsLiteral from ...types import PsType, PsIeeeFloatType @@ -16,7 +17,7 @@ class CudaPlatform(GenericGpu): '"npp.h"', } - def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression: assert isinstance(dtype, PsIeeeFloatType) match func: diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 787b390fe..8b7eead8d 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -206,7 +206,7 @@ class GenericGpu(Platform): } @abstractmethod - def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression: pass def __init__( @@ -371,7 +371,7 @@ class GenericGpu(Platform): dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions: + if isinstance(dtype, PsScalarType) and isinstance(func, NumericLimitsFunctions): return self.resolve_numeric_limits(func, dtype) if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions: diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index 60e249aeb..45d60452b 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -2,7 +2,7 @@ from __future__ import annotations from .generic_gpu import GenericGpu from ..ast.expressions import PsExpression, PsLiteralExpr -from ..functions import PsMathFunction +from ..functions import NumericLimitsFunctions from ..literals import PsLiteral from ...types import PsType, PsIeeeFloatType @@ -17,7 +17,7 @@ class HipPlatform(GenericGpu): "<limits>" } - def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression: + def resolve_numeric_limits(self, func: 
NumericLimitsFunctions, dtype: PsType) -> PsExpression: assert isinstance(dtype, PsIeeeFloatType) return PsLiteralExpr( -- GitLab From 806dcb6b2aaa42849d8af7f12b62b730eec7fa0e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 27 Mar 2025 17:02:57 +0100 Subject: [PATCH 135/180] Move resolution of reductions to concrete gpu platform classes --- src/pystencils/backend/platforms/cuda.py | 114 +++++++++++++- .../backend/platforms/generic_gpu.py | 143 ++++++------------ src/pystencils/backend/platforms/hip.py | 22 ++- 3 files changed, 172 insertions(+), 107 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 7a5074677..da8375c5e 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,32 @@ from __future__ import annotations +import math + from .generic_gpu import GenericGpu -from ..ast.expressions import PsExpression, PsLiteralExpr +from ..ast import PsAstNode +from ..ast.expressions import ( + PsExpression, + PsLiteralExpr, + PsCall, + PsAnd, + PsConstantExpr, + PsSymbolExpr, +) +from ..ast.structural import ( + PsConditional, + PsStatement, + PsAssignment, + PsBlock, + PsStructuralNode, +) +from ..constants import PsConstant from ..exceptions import MaterializationError -from ..functions import NumericLimitsFunctions +from ..functions import NumericLimitsFunctions, CFunction from ..literals import PsLiteral -from ...types import PsType, PsIeeeFloatType +from ...compound_op_mapping import compound_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType +from ...types.quick import SInt, UInt class CudaPlatform(GenericGpu): @@ -17,7 +38,92 @@ class CudaPlatform(GenericGpu): '"npp.h"', } - def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression: + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + stype = symbol_expr.dtype + ptrtype = ptr_expr.dtype + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType) + + if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): + NotImplementedError( + "atomic operations are only available for float32/64 datatypes" + ) + + # workaround for subtractions -> use additions for reducing intermediate results + # similar to OpenMP reductions: local copies (negative sign) are added at the end + match reduction_op: + case ReductionOp.Sub: + actual_reduction_op = ReductionOp.Add + case _: + actual_reduction_op = reduction_op + + # check if thread is valid for performing reduction + ispace = self._ctx.get_iteration_space() + is_valid_thread = self._get_condition_for_translation(ispace) + + cond: PsExpression + shuffles: tuple[PsAssignment, ...] 
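
An aside on the shuffle-based branch that follows: the offsets generated there form a butterfly over the warp. The standalone sketch below (not part of the patch, assuming a warp size of 32 and an add-reduction) mirrors the offset computation used in resolve_reduction and models __shfl_xor_sync with a plain list, checking that the butterfly leaves the warp-wide result in every lane.

    import math

    warp_size = 32
    # Same arithmetic as in resolve_reduction: frexp(32) == (0.5, 6),
    # so the generated offsets are 2**4 .. 2**0, i.e. 16, 8, 4, 2, 1.
    num_shuffles = math.frexp(warp_size)[1]
    offsets = [pow(2, i - 1) for i in reversed(range(1, num_shuffles))]
    assert offsets == [16, 8, 4, 2, 1]

    # Model of __shfl_xor_sync for an add-reduction: lane l reads from lane l ^ offset.
    lanes = [float(l) for l in range(warp_size)]
    for off in offsets:
        lanes = [lanes[l] + lanes[l ^ off] for l in range(warp_size)]

    # After the butterfly every lane holds the warp-wide sum,
    # so only the first thread of the warp needs to issue the atomic.
    assert all(v == sum(range(warp_size)) for v in lanes)
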
+ if self._warp_size and self._assume_warp_aligned_block_size: + # perform local warp reductions + def gen_shuffle_instr(offset: int): + full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) + return PsCall( + CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype), + [ + full_mask, + symbol_expr, + PsConstantExpr(PsConstant(offset, SInt(32))), + ], + ) + + # set up shuffle instructions for warp-level reduction + num_shuffles = math.frexp(self._warp_size)[1] + shuffles = tuple( + PsAssignment( + symbol_expr, + compound_op_to_expr( + actual_reduction_op, + symbol_expr, + gen_shuffle_instr(pow(2, i - 1)), + ), + ) + for i in reversed(range(1, num_shuffles)) + ) + + # find first thread in warp + first_thread_in_warp = self._first_thread_in_warp(ispace) + + # set condition to only execute atomic operation on first valid thread in warp + cond = ( + PsAnd(is_valid_thread, first_thread_in_warp) + if is_valid_thread + else first_thread_in_warp + ) + else: + # no optimization: only execute atomic add on valid thread + shuffles = () + cond = is_valid_thread + + # use atomic operation + func = CFunction( + f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void") + ) + func_args = (ptr_expr, symbol_expr) + + # assemble warp reduction + return shuffles, PsConditional( + cond, PsBlock([PsStatement(PsCall(func, func_args))]) + ) + + def resolve_numeric_limits( + self, func: NumericLimitsFunctions, dtype: PsType + ) -> PsExpression: assert isinstance(dtype, PsIeeeFloatType) match func: diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 8b7eead8d..8a4dd11a2 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,16 +1,19 @@ from __future__ import annotations -import math import operator from abc import ABC, abstractmethod from functools import reduce from ..ast import PsAstNode from ..constants import PsConstant -from ...compound_op_mapping import compound_op_to_expr from ...sympyextensions.reduction import ReductionOp -from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType -from ...types.quick import UInt, SInt +from ...types import ( + constify, + deconstify, + PsScalarType, + PsType, +) +from ...types.quick import SInt from ..exceptions import MaterializationError from .platform import Platform @@ -28,8 +31,6 @@ from ..ast.structural import ( PsBlock, PsConditional, PsDeclaration, - PsStatement, - PsAssignment, PsStructuralNode, ) from ..ast.expressions import ( @@ -39,7 +40,6 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, - PsSymbolExpr, PsConstantExpr, PsAdd, PsRem, @@ -206,7 +206,18 @@ class GenericGpu(Platform): } @abstractmethod - def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression: + def resolve_numeric_limits( + self, func: NumericLimitsFunctions, dtype: PsType + ) -> PsExpression: + pass + + @abstractmethod + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: pass def __init__( @@ -262,6 +273,31 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") + @staticmethod + def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + """Returns thread indices multiplied with block dimension strides per dimension.""" + + return tuple( + idx + * 
PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) + ) + + def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: + """Returns expression that determines whether a thread is the first within a warp.""" + + tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tid: PsExpression = tids_per_dim[0] + for t in tids_per_dim[1:]: + tid = PsAdd(tid, t) + + return PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) + def select_function( self, call: PsCall ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: @@ -276,97 +312,8 @@ class GenericGpu(Platform): ): ptr_expr, symbol_expr = call.args op = call_func.reduction_op - stype = symbol_expr.dtype - ptrtype = ptr_expr.dtype - - assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( - ptrtype, PsPointerType - ) - assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( - stype, PsScalarType - ) - - if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64): - NotImplementedError( - "atomic operations are only available for float32/64 datatypes" - ) - - # workaround for subtractions -> use additions for reducing intermediate results - # similar to OpenMP reductions: local copies (negative sign) are added at the end - match op: - case ReductionOp.Sub: - actual_op = ReductionOp.Add - case _: - actual_op = op - - # check if thread is valid for performing reduction - ispace = self._ctx.get_iteration_space() - is_valid_thread = self._get_condition_for_translation(ispace) - - cond: PsExpression - shuffles: tuple[PsAssignment, ...] - if self._warp_size and self._assume_warp_aligned_block_size: - # perform local warp reductions - def gen_shuffle_instr(offset: int): - full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))) - return PsCall( - CFunction( - "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype - ), - [ - full_mask, - symbol_expr, - PsConstantExpr(PsConstant(offset, SInt(32))), - ], - ) - - # set up shuffle instructions for warp-level reduction - num_shuffles = math.frexp(self._warp_size)[1] - shuffles = tuple( - PsAssignment( - symbol_expr, - compound_op_to_expr( - actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)) - ), - ) - for i in reversed(range(1, num_shuffles)) - ) - - # find first thread in warp - thread_indices_per_dim = [ - idx - * PsConstantExpr( - PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) - ) - for i, idx in enumerate(THREAD_IDX[: ispace.rank]) - ] - tid: PsExpression = thread_indices_per_dim[0] - for t in thread_indices_per_dim[1:]: - tid = PsAdd(tid, t) - first_thread_in_warp = PsEq( - PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), - PsConstantExpr(PsConstant(0, SInt(32))), - ) - - # set condition to only execute atomic operation on first valid thread in warp - cond = ( - PsAnd(is_valid_thread, first_thread_in_warp) - if is_valid_thread - else first_thread_in_warp - ) - else: - # no optimization: only execute atomic add on valid thread - shuffles = () - cond = is_valid_thread - - # use atomic operation - call.function = CFunction( - f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void") - ) - call.args = (ptr_expr, symbol_expr) - # assemble warp reduction - return shuffles, PsConditional(cond, PsBlock([PsStatement(call)])) + return self.resolve_reduction(ptr_expr, symbol_expr, op) dtype = call.get_dtype() arg_types = (dtype,) * func.num_args diff --git 
a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py index 45d60452b..404d9bb27 100644 --- a/src/pystencils/backend/platforms/hip.py +++ b/src/pystencils/backend/platforms/hip.py @@ -1,9 +1,13 @@ from __future__ import annotations from .generic_gpu import GenericGpu +from ..ast import PsAstNode from ..ast.expressions import PsExpression, PsLiteralExpr +from ..ast.structural import PsStructuralNode +from ..exceptions import MaterializationError from ..functions import NumericLimitsFunctions from ..literals import PsLiteral +from ...sympyextensions import ReductionOp from ...types import PsType, PsIeeeFloatType @@ -12,12 +16,11 @@ class HipPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return super().required_headers | { - '"pystencils_runtime/hip.h"', - "<limits>" - } + return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"} - def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression: + def resolve_numeric_limits( + self, func: NumericLimitsFunctions, dtype: PsType + ) -> PsExpression: assert isinstance(dtype, PsIeeeFloatType) return PsLiteralExpr( @@ -26,3 +29,12 @@ class HipPlatform(GenericGpu): dtype, ) ) + + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + + raise MaterializationError("Reductions are yet not supported in HIP backend.") -- GitLab From b008a9e9954b83fd371c572c321b26597211a9c1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 28 Mar 2025 13:26:09 +0100 Subject: [PATCH 136/180] Adapt guards for generated avx512 horizontal ops --- src/pystencils/include/simd_horizontal_helpers.h | 2 +- util/generate_simd_horizontal_op.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h index cd4bd5730..bd1889153 100644 --- a/src/pystencils/include/simd_horizontal_helpers.h +++ b/src/pystencils/include/simd_horizontal_helpers.h @@ -120,7 +120,7 @@ inline float _mm256_horizontal_max_ps(float dst, __m256 src) { #endif -#if defined(__AVX512VL__) +#if defined(__AVX512F__) #include <immintrin.h> inline double _mm512_horizontal_add_pd(double dst, __m512d src) { diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py index aebbf35bb..1d652c6e1 100644 --- a/util/generate_simd_horizontal_op.py +++ b/util/generate_simd_horizontal_op.py @@ -277,7 +277,7 @@ vtypes_for_instruction_set = { guards_for_instruction_sets = { InstructionSets.SSE3: "__SSE3__", InstructionSets.AVX: "__AVX__", - InstructionSets.AVX512: '__AVX512VL__', + InstructionSets.AVX512: '__AVX512F__', InstructionSets.NEON: '_M_ARM64', } -- GitLab From 7b43ffd2e12f5d69ccc30fdf451ee47d7caea6ae Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 31 Mar 2025 13:45:15 +0200 Subject: [PATCH 137/180] Add minor comment to ReductionInfo dataclass --- src/pystencils/backend/kernelcreation/context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 536c73c7f..358b5ff6c 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -49,6 +49,8 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) @dataclass(frozen=True) class 
ReductionInfo: + """Information about a reduction operation, its neutral element in form of an initial value + and the pointer used by the kernel as write-back argument.""" op: ReductionOp init_val: PsExpression -- GitLab From 3484e7f794e4e720d7bc5931b90a4b7caf2ffc59 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 31 Mar 2025 13:49:47 +0200 Subject: [PATCH 138/180] Rename compound op mapping funcs --- src/pystencils/backend/kernelcreation/freeze.py | 10 ++++++---- src/pystencils/backend/platforms/generic_cpu.py | 4 ++-- src/pystencils/backend/platforms/generic_gpu.py | 4 ++-- ...{compound_op_mapping.py => reduction_op_mapping.py} | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) rename src/pystencils/{compound_op_mapping.py => reduction_op_mapping.py} (82%) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 63e9ea5b1..5cdb3864c 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -13,7 +13,7 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) -from ...compound_op_mapping import compound_op_to_expr +from ...reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReductionAssignment, ReductionOp @@ -174,15 +174,17 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - _str_to_compound_op: dict[str, ReductionOp] = { + # transform augmented assignment to reduction op + str_to_reduction_op: dict[str, ReductionOp] = { "+=": ReductionOp.Add, "-=": ReductionOp.Sub, "*=": ReductionOp.Mul, "/=": ReductionOp.Div, } + # reuse existing handling for transforming reduction ops to expressions return PsAssignment( - lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs) + lhs, reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs) ) def map_ReductionAssignment(self, expr: ReductionAssignment): @@ -208,7 +210,7 @@ class FreezeExpressions: new_lhs = PsSymbolExpr(new_lhs_symb) # get new rhs from augmented assignment - new_rhs: PsExpression = compound_op_to_expr(op, new_lhs.clone(), rhs) + new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs) # match for reduction operation and set neutral init_val init_val: PsExpression diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index ccef61817..3de7cf696 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -13,7 +13,7 @@ from ..functions import ( PsReductionFunction, ) from ..literals import PsLiteral -from ...compound_op_mapping import compound_op_to_expr +from ...reduction_op_mapping import reduction_op_to_expr from ...sympyextensions import ReductionOp from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType @@ -97,7 +97,7 @@ class GenericCpu(Platform): actual_op = ReductionOp.Add if op is ReductionOp.Sub else op # create binop and potentially select corresponding function for e.g. 
min or max - potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr) + potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr) if isinstance(potential_call, PsCall): potential_call.dtype = symbol_expr.dtype return self.select_function(potential_call) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 8b7eead8d..349e79d4b 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -7,7 +7,7 @@ from functools import reduce from ..ast import PsAstNode from ..constants import PsConstant -from ...compound_op_mapping import compound_op_to_expr +from ...reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.reduction import ReductionOp from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType from ...types.quick import UInt, SInt @@ -325,7 +325,7 @@ class GenericGpu(Platform): shuffles = tuple( PsAssignment( symbol_expr, - compound_op_to_expr( + reduction_op_to_expr( actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1)) ), ) diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/reduction_op_mapping.py similarity index 82% rename from src/pystencils/compound_op_mapping.py rename to src/pystencils/reduction_op_mapping.py index 193b308d0..06fb8aa3e 100644 --- a/src/pystencils/compound_op_mapping.py +++ b/src/pystencils/reduction_op_mapping.py @@ -11,7 +11,7 @@ _available_operator_interface: set[ReductionOp] = { } -def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: +def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: if op in _available_operator_interface: match op: case ReductionOp.Add: @@ -24,7 +24,7 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: operator = PsDiv case _: raise FreezeError( - f"Found unsupported operation type for compound assignments: {op}." + f"Found unsupported operation type for reduction assignments: {op}." ) return operator(op1, op2) else: @@ -35,5 +35,5 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) case _: raise FreezeError( - f"Found unsupported operation type for compound assignments: {op}." + f"Found unsupported operation type for reduction assignments: {op}." ) -- GitLab From 492fb30a9da935fe0273201de79e90b1eb1ae338 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Mon, 31 Mar 2025 13:53:34 +0200 Subject: [PATCH 139/180] Adapt error messages for reduced assignments in freeze.py --- src/pystencils/backend/kernelcreation/freeze.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 5cdb3864c..9dc3928b3 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -200,7 +200,8 @@ class FreezeExpressions: orig_lhs_symb = lhs.symbol dtype = lhs.dtype - assert isinstance(dtype, PsNumericType) + assert isinstance(dtype, PsNumericType), \ + "Reduction assignments require type information of the lhs symbol." 
# replace original symbol with pointer-based type used for export orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) @@ -226,7 +227,7 @@ class FreezeExpressions: case ReductionOp.Max: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case _: - raise FreezeError(f"Unsupported reduced assignment: {op}.") + raise FreezeError(f"Unsupported kind of reduction assignment: {op}.") reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr) -- GitLab From f469e70e18c278385ab4237b3f30afb70da4c606 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 2 Apr 2025 19:19:28 +0200 Subject: [PATCH 140/180] Extend test_reduction_gpu with assume_warp_aligned_block_size and use_block_fitting parameters --- tests/kernelcreation/test_reduction.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index 6e2b2f3fe..cd1710cf5 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -52,7 +52,14 @@ def test_reduction_cpu(instruction_set, dtype, op): @pytest.mark.parametrize("dtype", ["float64", "float32"]) @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) -def test_reduction_gpu(dtype, op): +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: bool, +): try: import cupy as cp from cupy_backends.cuda.api.runtime import CUDARuntimeError @@ -66,12 +73,16 @@ def test_reduction_gpu(dtype, op): reason="No CUDA capable device is detected", allow_module_level=True ) - config = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg = ps.CreateKernelConfig(target=ps.Target.GPU) + cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size - ast_reduction = get_reduction_assign_ast(dtype, op, config) + ast_reduction = get_reduction_assign_ast(dtype, op, cfg) ps.show_code(ast_reduction) kernel_reduction = ast_reduction.compile() + if use_block_fitting: + kernel_reduction.launch_config.fit_block_size((32, 1, 1)) + array = np.full((SIZE,), INIT_ARR, dtype=dtype) reduction_array = np.full((1,), INIT_W, dtype=dtype) -- GitLab From 5bef84a8e28279c46cd73b67352201622d9aa89e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 2 Apr 2025 19:47:49 +0200 Subject: [PATCH 141/180] Write howto guide for reductions --- docs/source/backend/gpu_codegen.md | 1 + docs/source/backend/index.rst | 1 + docs/source/backend/reduction_codegen.md | 122 +++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/user_manual/reductions.md | 132 +++++++++++++++++++++++ 5 files changed, 257 insertions(+) create mode 100644 docs/source/backend/reduction_codegen.md create mode 100644 docs/source/user_manual/reductions.md diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md index 3fe00840e..a95c36566 100644 --- a/docs/source/backend/gpu_codegen.md +++ b/docs/source/backend/gpu_codegen.md @@ -1,3 +1,4 @@ +(gpu_codegen)= # GPU Code Generation The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components: diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst index 0d384c55b..b9b400544 100644 --- a/docs/source/backend/index.rst +++ b/docs/source/backend/index.rst @@ -16,6 +16,7 @@ who wish to 
customize or extend the behaviour of the code generator in their app iteration_space translation platforms + reduction_codegen transformations gpu_codegen errors diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md new file mode 100644 index 000000000..360c69256 --- /dev/null +++ b/docs/source/backend/reduction_codegen.md @@ -0,0 +1,122 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +mystnb: + execution_mode: cache +--- + +```{code-cell} ipython3 +:tags: [remove-cell, raises-exception] + +import pystencils as ps +import numpy as np +import cupy as cp +``` + +(codegen_reductions)= +# Code Generation for Reductions + +In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain +optimization strategies have. +For this, we set up the update rule for a simple dot product kernel: + +```{code-cell} ipython3 +r = ps.TypedSymbol("r", "double") +x, y = ps.fields(f"x, y: double[3D]", layout="fzyx") + +assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center()) +``` + +## CPU Platforms + +We first consider a base variant for CPUs without employing any optimizations. +The generated code for this variant looks as follows: + +```{code-cell} ipython3 +cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) +kernel_cpu = ps.create_kernel(assign_dot_prod, cfg) + +ps.inspect(kernel_cpu) +``` + +We want the reduction kernel to be SIMD vectorized and employ shared-memory parallelism using OpenMP. +The supported SIMD instruction sets for reductions are: +* SSE3 +* AVX/AVX2 +* AVX512 + +Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`. +**Note that reductions require `assume_inner_stride_one` to be enabled.** +This is due to the fact that other inner strides would require masked SIMD operations +which are not supported yet. + +```{code-cell} ipython3 +# configure SIMD vectorization +cfg = ps.CreateKernelConfig( + target=ps.Target.X86_AVX, +) +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True + +# configure OpenMP parallelization +cfg.cpu.openmp.enable = True +cfg.cpu.openmp.num_threads = 8 + +kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg) + +ps.inspect(kernel_cpu_opt) +``` + +## GPU Platforms + +Reductions are currently only supported for CUDA platforms. +Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown: + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + kernel_gpu = ps.create_kernel(assign_dot_prod, cfg) + + ps.inspect(kernel_gpu) +``` + +As evident from the code, the generated kernel employs atomic operations for updating the pointer +holding the reduction result. +Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance compared to +only using atomic operations. +To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by +the hardware's warp size. 
+**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this +criterion by enabling `assume_warp_aligned_block_size`.** +While the default block sizes provided by the code generator already fulfill this criterion, +we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space. + +You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + cfg.gpu.assume_warp_aligned_block_size = True + + kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg) + + kernel_func = kernel_gpu_opt.compile() + kernel_func.launch_config.fit_block_size((32, 1, 1)) + + ps.inspect(kernel_gpu_opt) +``` + +:::{admonition} Developers To Do: + +- Support for HIP platforms +- Support vectorization using NEON intrinsics +::: + diff --git a/docs/source/index.rst b/docs/source/index.rst index 6dba50af1..4e1070979 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,6 +81,7 @@ Topics user_manual/symbolic_language user_manual/kernelcreation + user_manual/reductions user_manual/gpu_kernels user_manual/WorkingWithTypes diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md new file mode 100644 index 000000000..454fd7df4 --- /dev/null +++ b/docs/source/user_manual/reductions.md @@ -0,0 +1,132 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.4 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +mystnb: + execution_mode: cache +--- + +```{code-cell} ipython3 +:tags: [remove-cell, raises-exception] + +import sympy as sp +import pystencils as ps +import numpy as np +import cupy as cp + +from enum import Enum +``` + +(guide_reductions)= +# Reductions in Pystencils + +Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements, +such as computing sums, products over an array or finding its minima or maxima. + +## Specifying Assignments with Reductions + +In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`. +Here is a snippet creating a reduction assignment for adding up all elements of a field: + +```{code-cell} ipython3 +r = ps.TypedSymbol("r", "double") +x = ps.fields(f"x: double[3D]", layout="fzyx") + +assign_sum = ps.AddReductionAssignment(r, x.center()) +``` + +For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the +right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions. 
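+
+Reduction assignments accept arbitrary expressions on their right-hand side. As an illustrative sketch reusing
+the same API (the fields `a` and `b` below are introduced purely for demonstration), a dot product of two fields
+can be expressed by accumulating the product of their field accesses into `r`:
+
+```{code-cell} ipython3
+a, b = ps.fields("a, b: double[3D]", layout="fzyx")
+
+assign_dot_prod = ps.AddReductionAssignment(r, a.center() * b.center())
+```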
+ +**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.** + +The following reduction assignment classes are available in pystencils: +* `AddReductionAssignment`: Builds sum over elements +* `SubReductionAssignment`: Builds difference over elements +* `MulReductionAssignment`: Builds product over elements +* `MinReductionAssignment`: Finds minimum element +* `MaxReductionAssignment`: Finds maximum element + +:::{note} +AlternatÃÂvely, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions +to specify reduction assignments: +::: + +```{code-cell} ipython3 +from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str +from pystencils.sympyextensions.reduction import ReductionOp + +assign_sum = reduction_assignment(r, ReductionOp.Add, x.center()) + +assign_sum = reduction_assignment_from_str(r, "+", x.center()) +``` + +For other reduction operations, the following enums can be passed to `reduction_assignment` +or the corresponding strings can be passed to `reduction_assignment_from_str`. + +```{code-cell} python3 +class ReductionOp(Enum): + Add = "+" + Sub = "-" + Mul = "*" + Min = "min" + Max = "max" +``` + +## Generating Reduction Kernels + +With the assignments being fully assembled, we can finally invoke the code generator and +create the kernel object via the {any}`create_kernel` function. +For this example, we assume a kernel configuration where no optimizations are explicitly enabled. + +```{code-cell} ipython3 +cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) +kernel = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel) +``` + +:::{note} +The generated reduction kernels may vary vastly for different platforms and optimizations. +For the sake of compactness, the impact of different backend or optimization choices is left out. + +A detailed description of configuration choices and their impact on the generated code can be found in +{ref}`codegen_reductions`. +::: + +The kernel can be compiled and run immediately. + +To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed for each field +but also one for exporting reduction results. +The export mechanism can be seen in the previously generated code snippet. +Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`). +This pointer not only allows providing initial values for the reduction but is also used for writing back the +reduction result. +Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value. + +```{code-cell} ipython3 + kernel_func = kernel.compile() + + x_array = np.ones((4, 4, 4), dtype="float64") + reduction_result = np.zeros((1,), dtype="float64") + + kernel_func(x=x_array, r=reduction_result) + + reduction_result[0] +``` + +For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, +i.e. instances of {any}`cupy.ndarray`. + +:::{admonition} Developers To Do: + +- Support for higher-order data types for reductions, e.g. 
vector/matrix reductions +::: -- GitLab From a2060520faddc99c58c8f22eedfe267d09d525b1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 10:54:44 +0200 Subject: [PATCH 142/180] Get rid of reduction_assignment_from_str --- docs/source/user_manual/reductions.md | 9 +++------ src/pystencils/sympyextensions/__init__.py | 3 +-- tests/kernelcreation/test_reduction.py | 18 +++++++++--------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 454fd7df4..6af0e8580 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -56,21 +56,18 @@ The following reduction assignment classes are available in pystencils: * `MaxReductionAssignment`: Finds maximum element :::{note} -AlternatÃÂvely, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions +AlternatÃÂvely, you can also make use of the `reduction_assignment` function to specify reduction assignments: ::: ```{code-cell} ipython3 -from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str +from pystencils.sympyextensions import reduction_assignment from pystencils.sympyextensions.reduction import ReductionOp assign_sum = reduction_assignment(r, ReductionOp.Add, x.center()) - -assign_sum = reduction_assignment_from_str(r, "+", x.center()) ``` -For other reduction operations, the following enums can be passed to `reduction_assignment` -or the corresponding strings can be passed to `reduction_assignment_from_str`. +For other reduction operations, the following enums can be passed to `reduction_assignment`. ```{code-cell} python3 class ReductionOp(Enum): diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py index bd0fa1fe9..c575feeb3 100644 --- a/src/pystencils/sympyextensions/__init__.py +++ b/src/pystencils/sympyextensions/__init__.py @@ -1,7 +1,7 @@ from .astnodes import ConditionalFieldAccess from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType from .pointers import mem_acc -from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp +from .reduction import reduction_assignment, ReductionOp from .math import ( prod, @@ -35,7 +35,6 @@ from .math import ( __all__ = [ "ConditionalFieldAccess", "reduction_assignment", - "reduction_assignment_from_str", "ReductionOp", "TypedSymbol", "CastFunc", diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py index cd1710cf5..1fb8efc81 100644 --- a/tests/kernelcreation/test_reduction.py +++ b/tests/kernelcreation/test_reduction.py @@ -2,17 +2,17 @@ import pytest import numpy as np import pystencils as ps -from pystencils.sympyextensions import reduction_assignment_from_str +from pystencils.sympyextensions import ReductionOp, reduction_assignment INIT_W = 5 INIT_ARR = 2 SIZE = 15 SOLUTION = { - "+": INIT_W + INIT_ARR * SIZE, - "-": INIT_W - INIT_ARR * SIZE, - "*": INIT_W * INIT_ARR**SIZE, - "min": min(INIT_W, INIT_ARR), - "max": max(INIT_W, INIT_ARR), + ReductionOp.Add: INIT_W + INIT_ARR * SIZE, + ReductionOp.Sub: INIT_W - INIT_ARR * SIZE, + ReductionOp.Mul: INIT_W * INIT_ARR**SIZE, + ReductionOp.Min: min(INIT_W, INIT_ARR), + ReductionOp.Max: max(INIT_W, INIT_ARR), } @@ -21,14 +21,14 @@ def get_reduction_assign_ast(dtype, op, config): x = ps.fields(f"x: {dtype}[1d]") w = ps.TypedSymbol("w", dtype) - red_assign = reduction_assignment_from_str(w, op, 
x.center()) + red_assign = reduction_assignment(w, op, x.center()) return ps.create_kernel([red_assign], config, default_dtype=dtype) @pytest.mark.parametrize("instruction_set", ["sse", "avx"]) @pytest.mark.parametrize("dtype", ["float64", "float32"]) -@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) def test_reduction_cpu(instruction_set, dtype, op): vectorize_info = { "instruction_set": instruction_set, @@ -51,7 +51,7 @@ def test_reduction_cpu(instruction_set, dtype, op): @pytest.mark.parametrize("dtype", ["float64", "float32"]) -@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max]) @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) @pytest.mark.parametrize("use_block_fitting", [True, False]) def test_reduction_gpu( -- GitLab From 21df6f4b5915134b537c20398b8bdeec00b6b28e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 10:57:43 +0200 Subject: [PATCH 143/180] Omit admonitions from docs --- docs/source/backend/reduction_codegen.md | 6 ------ docs/source/user_manual/reductions.md | 5 ----- 2 files changed, 11 deletions(-) diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md index 360c69256..f08fa980a 100644 --- a/docs/source/backend/reduction_codegen.md +++ b/docs/source/backend/reduction_codegen.md @@ -114,9 +114,3 @@ You can find more detailed information about warp size alignment in {ref}`gpu_co ps.inspect(kernel_gpu_opt) ``` -:::{admonition} Developers To Do: - -- Support for HIP platforms -- Support vectorization using NEON intrinsics -::: - diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 6af0e8580..0b0c04279 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -122,8 +122,3 @@ Since our reduction result is a single scalar value, it is sufficient to set up For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, i.e. instances of {any}`cupy.ndarray`. - -:::{admonition} Developers To Do: - -- Support for higher-order data types for reductions, e.g. vector/matrix reductions -::: -- GitLab From 99726a97a0a834dfb53960ad1fb72be4f72849e2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 11:30:29 +0200 Subject: [PATCH 144/180] Add docstring to PsVecHorizontal --- src/pystencils/backend/ast/vector.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 4141b0296..291d76e50 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -41,7 +41,16 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): class PsVecHorizontal(PsBinOp, PsVectorOp): - """Extracts scalar value from N vector lanes.""" + """Represents a binary operation between a scalar and a vector operand. + With the binary operation not being vectorized, a horizontal reduction + along the lanes of the vector operand is required to extract a scalar value. + The result type will be equal to the scalar operand. 
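+
+    For example, assuming an additive reduction over a four-lane vector operand ``v``,
+    the result is conceptually ``scalar_operand + (v[0] + v[1] + v[2] + v[3])``.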
+ + Args: + scalar_operand: Scalar operand + vector_operand: Vector operand to be converted to a scalar value + reduction_op: Binary operation that is also used for the horizontal reduction + """ __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op") -- GitLab From f7142b16ae26f30f96f07e0063c65389fbc34b3f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 11:32:47 +0200 Subject: [PATCH 145/180] Remove lanes arg from PsVecHorizontal --- src/pystencils/backend/ast/vector.py | 13 +------------ src/pystencils/backend/emission/ir_printer.py | 4 ++-- .../backend/transformations/loop_vectorizer.py | 1 - 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 291d76e50..a074ea6ff 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -56,23 +56,13 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): def __init__( self, - lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression, reduction_op: ReductionOp, ): super().__init__(scalar_operand, vector_operand) - self._lanes = lanes self._reduction_op = reduction_op - @property - def lanes(self) -> int: - return self._lanes - - @lanes.setter - def lanes(self, n: int): - self._lanes = n - @property def scalar_operand(self) -> PsExpression: return self._op1 @@ -99,7 +89,7 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): def _clone_expr(self) -> PsVecHorizontal: return PsVecHorizontal( - self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op + self._op1.clone(), self._op2.clone(), self._reduction_op ) def structurally_equal(self, other: PsAstNode) -> bool: @@ -107,7 +97,6 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): return False return ( super().structurally_equal(other) - and self._lanes == other._lanes and self._reduction_op == other._reduction_op ) diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py index 22ae2f91a..5a3836d50 100644 --- a/src/pystencils/backend/emission/ir_printer.py +++ b/src/pystencils/backend/emission/ir_printer.py @@ -77,14 +77,14 @@ class IRAstPrinter(BasePrinter): f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest ) - case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op): + case PsVecHorizontal(scalar_operand, vector_operand, reduction_op): pc.push_op(Ops.Weakest, LR.Middle) scalar_operand_code = self.visit(scalar_operand, pc) vector_operand_code = self.visit(vector_operand, pc) pc.pop_op() return pc.parenthesize( - f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})", + f"vec_horizontal_{reduction_op.name.lower()}({scalar_operand_code, vector_operand_code})", Ops.Weakest, ) diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index 48b9ad0da..09b0aa5dd 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -162,7 +162,6 @@ class LoopVectorizer: PsAssignment( PsSymbolExpr(symb), PsVecHorizontal( - self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb), reduction_info.op, -- GitLab From 427e53442e166cf34a23e7588a1f8cd72ce52438 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 11:35:47 +0200 Subject: [PATCH 146/180] Remove more parts of reduction_assignment_from_str --- 
src/pystencils/sympyextensions/reduction.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index e95e37c24..81da0dde9 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -66,17 +66,8 @@ _reduction_assignment_classes = { ] } -# Mapping from ReductionOp str to ReductionAssigment classes -_reduction_assignment_classes_for_str = { - cls.value: cls for cls in _reduction_assignment_classes -} - def reduction_assignment(lhs, op: ReductionOp, rhs): if op not in _reduction_assignment_classes: raise ValueError("Unrecognized operator %s" % op) return _reduction_assignment_classes[op](lhs, rhs) - - -def reduction_assignment_from_str(lhs, op: str, rhs): - return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs) -- GitLab From 36124cd24e89342ae96dd0bee8a05bdfc2542092 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 11:58:34 +0200 Subject: [PATCH 147/180] Add check for typed symbols for ReductionAssignment constructor --- src/pystencils/sympyextensions/reduction.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 81da0dde9..794c40451 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -2,6 +2,8 @@ from enum import Enum from sympy.codegen.ast import AssignmentBase +from pystencils import TypedSymbol + class ReductionOp(Enum): Add = "+" @@ -33,6 +35,13 @@ class ReductionAssignment(AssignmentBase): def reduction_op(self, op): self._reduction_op = op + @classmethod + def _check_args(cls, lhs, rhs): + super()._check_args(lhs, rhs) + + if not isinstance(lhs, TypedSymbol): + raise TypeError(f"lhs of needs to be a TypedSymbol. Got {type(lhs)} instead.") + class AddReductionAssignment(ReductionAssignment): reduction_op = ReductionOp.Add -- GitLab From 9e61ccebe4e6a37205441f94573f714e987d17cd Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 12:58:38 +0200 Subject: [PATCH 148/180] Omit lanes for match args in PsVecHorizontal --- src/pystencils/backend/ast/vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index a074ea6ff..55db67e7c 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -52,7 +52,7 @@ class PsVecHorizontal(PsBinOp, PsVectorOp): reduction_op: Binary operation that is also used for the horizontal reduction """ - __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op") + __match_args__ = ("scalar_operand", "vector_operand", "reduction_op") def __init__( self, -- GitLab From 7b74cafaedc589a52f91e7a28d60528956edd19f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 13:01:20 +0200 Subject: [PATCH 149/180] Fix import --- src/pystencils/sympyextensions/reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py index 794c40451..a1a9a026c 100644 --- a/src/pystencils/sympyextensions/reduction.py +++ b/src/pystencils/sympyextensions/reduction.py @@ -2,7 +2,7 @@ from enum import Enum from sympy.codegen.ast import AssignmentBase -from pystencils import TypedSymbol +from . 
import TypedSymbol class ReductionOp(Enum): -- GitLab From 8394f0f4c115cb37c5e53f3e08f62a32bd128a3e Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 14:18:29 +0200 Subject: [PATCH 150/180] Move reduction_op_mapping.py --- src/pystencils/backend/kernelcreation/freeze.py | 2 +- src/pystencils/backend/platforms/cuda.py | 2 +- src/pystencils/backend/platforms/generic_cpu.py | 2 +- src/pystencils/{ => backend}/reduction_op_mapping.py | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) rename src/pystencils/{ => backend}/reduction_op_mapping.py (82%) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 9dc3928b3..c5ff43fb9 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -13,7 +13,7 @@ from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) -from ...reduction_op_mapping import reduction_op_to_expr +from ..reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReductionAssignment, ReductionOp diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 8c3cd45fa..05e95011d 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -23,7 +23,7 @@ from ..constants import PsConstant from ..exceptions import MaterializationError from ..functions import NumericLimitsFunctions, CFunction from ..literals import PsLiteral -from ...reduction_op_mapping import reduction_op_to_expr +from ..reduction_op_mapping import reduction_op_to_expr from ...sympyextensions import ReductionOp from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType from ...types.quick import SInt, UInt diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py index 3de7cf696..4f8b562fa 100644 --- a/src/pystencils/backend/platforms/generic_cpu.py +++ b/src/pystencils/backend/platforms/generic_cpu.py @@ -13,7 +13,7 @@ from ..functions import ( PsReductionFunction, ) from ..literals import PsLiteral -from ...reduction_op_mapping import reduction_op_to_expr +from ..reduction_op_mapping import reduction_op_to_expr from ...sympyextensions import ReductionOp from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType diff --git a/src/pystencils/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py similarity index 82% rename from src/pystencils/reduction_op_mapping.py rename to src/pystencils/backend/reduction_op_mapping.py index 06fb8aa3e..876912acd 100644 --- a/src/pystencils/reduction_op_mapping.py +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -1,7 +1,7 @@ -from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv -from .backend.exceptions import FreezeError -from .backend.functions import PsMathFunction, MathFunctions -from .sympyextensions.reduction import ReductionOp +from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv +from .exceptions import FreezeError +from .functions import PsMathFunction, MathFunctions +from ..sympyextensions.reduction import ReductionOp _available_operator_interface: set[ReductionOp] = { ReductionOp.Add, -- GitLab From 935beb559495c5094a42cf0e807ca8ba203beb6d Mon Sep 17 
00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 14:41:04 +0200 Subject: [PATCH 151/180] Move kernel AST modifications for reductions to distinct function --- src/pystencils/codegen/driver.py | 53 +++++++++++++++++++------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index c285dd7bf..3d107eda3 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -19,6 +19,7 @@ from .properties import PsSymbolProperty, FieldBasePtr from .parameters import Parameter from .functions import Lambda from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration +from ..backend.kernelcreation.context import ReductionInfo from ..field import Field from ..types import PsIntegerType, PsScalarType @@ -192,28 +193,7 @@ class DefaultKernelCreationDriver: # Extensions for reductions for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): - typify = Typifier(self._ctx) - symbol_expr = typify(PsSymbolExpr(symbol)) - ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) - init_val = typify(reduction_info.init_val) - - ptr_access = PsMemAcc( - ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) - ) - write_back_ptr = PsCall( - PsReductionFunction( - ReductionFunctions.WriteBackToPtr, reduction_info.op - ), - [ptr_symbol_expr, symbol_expr], - ) - - # declare and init local copy with neutral element - prepend_ast = [PsDeclaration(symbol_expr, init_val)] - # write back result to reduction target variable - append_ast = [PsAssignment(ptr_access, write_back_ptr)] - - kernel_ast.statements = prepend_ast + kernel_ast.statements - kernel_ast.statements += append_ast + self._modify_kernel_ast_for_reductions(symbol, reduction_info, kernel_ast) # Target-Specific optimizations if self._target.is_cpu(): @@ -315,6 +295,35 @@ class DefaultKernelCreationDriver: return kernel_body + def _modify_kernel_ast_for_reductions(self, + symbol: PsSymbol, + reduction_info: ReductionInfo, + kernel_ast: PsBlock): + # typify local symbol and write-back pointer expressions and initial value + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(symbol)) + ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) + init_val = typify(reduction_info.init_val) + + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionFunction( + ReductionFunctions.WriteBackToPtr, reduction_info.op + ), + [ptr_symbol_expr, symbol_expr], + ) + + # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast = [PsAssignment(ptr_access, write_back_ptr)] + + # modify AST + kernel_ast.statements = prepend_ast + kernel_ast.statements + kernel_ast.statements += append_ast + def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock: canonicalize = CanonicalizeSymbols(self._ctx, True) kernel_ast = cast(PsBlock, canonicalize(kernel_ast)) -- GitLab From b4cabdd81a3e94f2881cb26798d6588badb1c377 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 14:57:25 +0200 Subject: [PATCH 152/180] Add consistency check for PsVecHorizontal in typifier --- src/pystencils/backend/kernelcreation/typification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/pystencils/backend/kernelcreation/typification.py 
b/src/pystencils/backend/kernelcreation/typification.py index b457f39a0..9585cb23f 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -605,6 +605,12 @@ class Typifier: f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" ) + if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type: + raise TypificationError( + f"Scalar type of vector operand {vector_op_tc.target_type} " + f"does not correspond to type of scalar operand {scalar_op_tc.target_type}" + ) + tc.apply_dtype(scalar_op_tc.target_type, expr) case PsBinOp(op1, op2): -- GitLab From 3c3283cd4a823b7aeac883a794c2e2fd05e984e2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 15:44:30 +0200 Subject: [PATCH 153/180] Merge reduction user guides into one document --- docs/source/backend/reduction_codegen.md | 116 ----------------------- docs/source/user_manual/reductions.md | 90 +++++++++++++++++- 2 files changed, 85 insertions(+), 121 deletions(-) delete mode 100644 docs/source/backend/reduction_codegen.md diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md deleted file mode 100644 index f08fa980a..000000000 --- a/docs/source/backend/reduction_codegen.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.16.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 -mystnb: - execution_mode: cache ---- - -```{code-cell} ipython3 -:tags: [remove-cell, raises-exception] - -import pystencils as ps -import numpy as np -import cupy as cp -``` - -(codegen_reductions)= -# Code Generation for Reductions - -In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain -optimization strategies have. -For this, we set up the update rule for a simple dot product kernel: - -```{code-cell} ipython3 -r = ps.TypedSymbol("r", "double") -x, y = ps.fields(f"x, y: double[3D]", layout="fzyx") - -assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center()) -``` - -## CPU Platforms - -We first consider a base variant for CPUs without employing any optimizations. -The generated code for this variant looks as follows: - -```{code-cell} ipython3 -cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) -kernel_cpu = ps.create_kernel(assign_dot_prod, cfg) - -ps.inspect(kernel_cpu) -``` - -We want the reduction kernel to be SIMD vectorized and employ shared-memory parallelism using OpenMP. -The supported SIMD instruction sets for reductions are: -* SSE3 -* AVX/AVX2 -* AVX512 - -Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`. -**Note that reductions require `assume_inner_stride_one` to be enabled.** -This is due to the fact that other inner strides would require masked SIMD operations -which are not supported yet. - -```{code-cell} ipython3 -# configure SIMD vectorization -cfg = ps.CreateKernelConfig( - target=ps.Target.X86_AVX, -) -cfg.cpu.vectorize.enable = True -cfg.cpu.vectorize.assume_inner_stride_one = True - -# configure OpenMP parallelization -cfg.cpu.openmp.enable = True -cfg.cpu.openmp.num_threads = 8 - -kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg) - -ps.inspect(kernel_cpu_opt) -``` - -## GPU Platforms - -Reductions are currently only supported for CUDA platforms. 
-Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown: - -```{code-cell} ipython3 - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - - kernel_gpu = ps.create_kernel(assign_dot_prod, cfg) - - ps.inspect(kernel_gpu) -``` - -As evident from the code, the generated kernel employs atomic operations for updating the pointer -holding the reduction result. -Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance compared to -only using atomic operations. -To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by -the hardware's warp size. -**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this -criterion by enabling `assume_warp_aligned_block_size`.** -While the default block sizes provided by the code generator already fulfill this criterion, -we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space. - -You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. - -```{code-cell} ipython3 - cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) - cfg.gpu.assume_warp_aligned_block_size = True - - kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg) - - kernel_func = kernel_gpu_opt.compile() - kernel_func.launch_config.fit_block_size((32, 1, 1)) - - ps.inspect(kernel_gpu_opt) -``` - diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 0b0c04279..46e935bbb 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -78,11 +78,14 @@ class ReductionOp(Enum): Max = "max" ``` -## Generating Reduction Kernels +## Generating and Running Reduction Kernels With the assignments being fully assembled, we can finally invoke the code generator and -create the kernel object via the {any}`create_kernel` function. -For this example, we assume a kernel configuration where no optimizations are explicitly enabled. +create the kernel object via the {any}`create_kernel` function. + +### CPU Platforms + +For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled. ```{code-cell} ipython3 cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU) @@ -120,5 +123,82 @@ Since our reduction result is a single scalar value, it is sufficient to set up reduction_result[0] ``` -For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, -i.e. instances of {any}`cupy.ndarray`. +### GPU Platforms + +Please note that **reductions are currently only supported for CUDA platforms**. +Similar to the CPU section, a base variant for NVIDIA GPUs without +explicitly employing any optimizations is shown: + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + + kernel_gpu = ps.create_kernel(assign_sum, cfg) + + ps.inspect(kernel_gpu) +``` + +The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer +now require device memory, i.e. instances of {any}`cupy.ndarray`. + +## Optimizations for Reductions + +Going beyond the aforementioned basic kernel configurations, +we now demonstrate optimization strategies for different platforms +that can be applied to reduction kernels and show what impact they have. 
+ +### CPU Platforms + +For CPU platforms, standard optimizations are employing SIMD vectorization and shared-memory parallelism using OpenMP. +The supported SIMD instruction sets for reductions are: +* SSE3 +* AVX/AVX2 +* AVX512 + +Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`. +**Note that reductions require `assume_inner_stride_one` to be enabled.** +This is due to the fact that other inner strides would require masked SIMD operations +which are not supported yet. + +```{code-cell} ipython3 +# configure SIMD vectorization +cfg = ps.CreateKernelConfig( + target=ps.Target.X86_AVX, +) +cfg.cpu.vectorize.enable = True +cfg.cpu.vectorize.assume_inner_stride_one = True + +# configure OpenMP parallelization +cfg.cpu.openmp.enable = True +cfg.cpu.openmp.num_threads = 8 + +kernel_cpu_opt = ps.create_kernel(assign_sum, cfg) + +ps.inspect(kernel_cpu_opt) +``` + +### GPU Platforms + +As evident from the generated kernel for the base variant, atomic operations are employed +for updating the pointer holding the reduction result. +Using the *explicit warp-level instructions* provided by CUDA allows us to achieve higher performance compared to +only using atomic operations. +To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by +the hardware's warp size. +**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this +criterion by enabling `assume_warp_aligned_block_size`.** +While the default block sizes provided by the code generator already fulfill this criterion, +we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space. + +You can find more detailed information about warp size alignment in {ref}`gpu_codegen`. + +```{code-cell} ipython3 + cfg = ps.CreateKernelConfig(target=ps.Target.CUDA) + cfg.gpu.assume_warp_aligned_block_size = True + + kernel_gpu_opt = ps.create_kernel(assign_sum, cfg) + + kernel_func = kernel_gpu_opt.compile() + kernel_func.launch_config.fit_block_size((32, 1, 1)) + + ps.inspect(kernel_gpu_opt) +``` -- GitLab From b0904d93d48f0a165e4b1561e3b267db09986221 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 16:45:26 +0200 Subject: [PATCH 154/180] Fix typecheck --- src/pystencils/backend/reduction_op_mapping.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py index 876912acd..832f5d0bf 100644 --- a/src/pystencils/backend/reduction_op_mapping.py +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -15,18 +15,17 @@ def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: if op in _available_operator_interface: match op: case ReductionOp.Add: - operator = PsAdd + return PsAdd(op1, op2) case ReductionOp.Sub: - operator = PsSub + return PsSub(op1, op2) case ReductionOp.Mul: - operator = PsMul + return PsMul(op1, op2) case ReductionOp.Div: - operator = PsDiv + return PsDiv(op1, op2) case _: raise FreezeError( f"Found unsupported operation type for reduction assignments: {op}." 
) - return operator(op1, op2) else: match op: case ReductionOp.Min: -- GitLab From 603e6a3fce53a0bbd4d2b21faa12c34da3936b1b Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 23 Apr 2025 16:46:27 +0200 Subject: [PATCH 155/180] Fix docs --- docs/source/backend/index.rst | 1 - docs/source/user_manual/reductions.md | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst index b9b400544..0d384c55b 100644 --- a/docs/source/backend/index.rst +++ b/docs/source/backend/index.rst @@ -16,7 +16,6 @@ who wish to customize or extend the behaviour of the code generator in their app iteration_space translation platforms - reduction_codegen transformations gpu_codegen errors diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 46e935bbb..7c7904dfc 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -96,10 +96,7 @@ ps.inspect(kernel) :::{note} The generated reduction kernels may vary vastly for different platforms and optimizations. -For the sake of compactness, the impact of different backend or optimization choices is left out. - -A detailed description of configuration choices and their impact on the generated code can be found in -{ref}`codegen_reductions`. +You can find a detailed description of configuration choices and their impact on the generated code below. ::: The kernel can be compiled and run immediately. -- GitLab From 6046db6e13ddd875e7c68edd9f99fb7a7101f47f Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 17:54:31 +0200 Subject: [PATCH 156/180] Move symbol handling for reductions to context and add more checks --- .../backend/kernelcreation/context.py | 51 ++++++++++----- .../backend/kernelcreation/freeze.py | 64 +++++++++++-------- .../backend/transformations/add_pragmas.py | 8 +-- .../transformations/loop_vectorizer.py | 11 ++-- src/pystencils/codegen/driver.py | 9 ++- 5 files changed, 89 insertions(+), 54 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 358b5ff6c..58a4bd7d1 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -54,7 +54,8 @@ class ReductionInfo: op: ReductionOp init_val: PsExpression - ptr_symbol: PsSymbol + local_symbol: PsSymbol + writeback_ptr_symbol: PsSymbol class KernelCreationContext: @@ -88,7 +89,7 @@ class KernelCreationContext: self._symbol_ctr_pattern = re.compile(r"__[0-9]+$") self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0) - self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict() + self._reduction_data: dict[str, ReductionInfo] = dict() self._fields_and_arrays: dict[str, FieldArrayPair] = dict() self._fields_collection = FieldsInKernel() @@ -193,19 +194,39 @@ class KernelCreationContext: self._symbols[old.name] = new - def add_symbol_reduction_info( - self, local_symb: PsSymbol, reduction_info: ReductionInfo + def add_reduction_info( + self, + lhs_name: str, + lhs_dtype: PsType, + reduction_op: ReductionOp, + init_value: PsExpression, ): - """Adds entry for a symbol and its reduction info to its corresponding lookup table. + """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name.""" - The symbol ``symbol`` shall not exist in the symbol table already. 
- """ - if local_symb in self._symbols_reduction_info: - raise PsInternalCompilerError( - f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table" - ) + # replace datatype of lhs symbol with pointer datatype for write-back mechanism + symb = self.get_symbol(lhs_name, lhs_dtype) + pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype)) + self.replace_symbol(symb, pointer_symb) + + # create kernel-local copy of lhs symbol + local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype) + self.add_symbol(local_symb) - self._symbols_reduction_info[local_symb] = reduction_info + # create reduction info and add to set + reduction_info = ReductionInfo( + reduction_op, init_value, local_symb, pointer_symb + ) + self._reduction_data[lhs_name] = reduction_info + + return reduction_info + + def find_reduction_info(self, name: str) -> ReductionInfo | None: + """Find a ReductionInfo with the given name in the lookup table, if it exists. + + Returns: + The ReductionInfo with the given name, or `None` if it does not exist. + """ + return self._reduction_data.get(name, None) def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None @@ -243,9 +264,9 @@ class KernelCreationContext: return self._symbols.values() @property - def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]: - """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" - return self._symbols_reduction_info + def reduction_data(self) -> dict[str, ReductionInfo]: + """Return a dictionary holding kernel-local reduction information for given symbol names.""" + return self._reduction_data # Fields and Arrays diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index c5ff43fb9..2f00df4e8 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -6,7 +6,6 @@ import sympy as sp import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment -from ..memory import PsSymbol from ...assignment import Assignment from ...simp import AssignmentCollection from ...sympyextensions import ( @@ -19,7 +18,7 @@ from ...sympyextensions.pointers import AddressOf, mem_acc from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...field import Field, FieldType -from .context import KernelCreationContext, ReductionInfo +from .context import KernelCreationContext from ..ast.structural import ( PsAstNode, @@ -62,7 +61,7 @@ from ..ast.expressions import ( from ..ast.vector import PsVecMemAcc from ..constants import PsConstant -from ...types import PsNumericType, PsStructType, PsType, PsPointerType +from ...types import PsNumericType, PsStructType, PsType from ..exceptions import PsInputError from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError @@ -190,32 +189,41 @@ class FreezeExpressions: def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) + # make sure that either: + # 1) lhs symbol never occurred + # 2) that it is at least known as lhs of an existing reduction operation + if self._ctx.find_symbol(expr.lhs.name): + # make sure that reduction operations are not mixed within a kernel + if info := self._ctx.find_reduction_info(expr.lhs.name): + if info.op is not expr.reduction_op: + raise FreezeError( + f"Different reduction operation {info.op} already exists " + f"for {expr.lhs} with target reduction op 
{expr.reduction_op}." + ) + else: + raise FreezeError( + f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table." + f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's." + ) + lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) assert isinstance(rhs, PsExpression) assert isinstance(lhs, PsSymbolExpr) - op = expr.reduction_op - orig_lhs_symb = lhs.symbol - dtype = lhs.dtype - - assert isinstance(dtype, PsNumericType), \ - "Reduction assignments require type information of the lhs symbol." - - # replace original symbol with pointer-based type used for export - orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) - - # create kernel-local copy of lhs symbol to work with - new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) - new_lhs = PsSymbolExpr(new_lhs_symb) + reduction_op = expr.reduction_op + lhs_symbol = lhs.symbol + lhs_dtype = lhs_symbol.dtype + lhs_name = lhs_symbol.name - # get new rhs from augmented assignment - new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs) + assert isinstance( + lhs_dtype, PsNumericType + ), "Reduction assignments require type information of the lhs symbol." # match for reduction operation and set neutral init_val init_val: PsExpression - match op: + match reduction_op: case ReductionOp.Add: init_val = PsConstantExpr(PsConstant(0)) case ReductionOp.Sub: @@ -227,14 +235,20 @@ class FreezeExpressions: case ReductionOp.Max: init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case _: - raise FreezeError(f"Unsupported kind of reduction assignment: {op}.") + raise FreezeError( + f"Unsupported kind of reduction assignment: {reduction_op}." + ) - reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr) + # get reduction info from context + reduction_info = self._ctx.add_reduction_info( + lhs_name, lhs_dtype, reduction_op, init_val + ) + + # create new lhs from newly created local lhs symbol + new_lhs = PsSymbolExpr(reduction_info.local_symbol) - # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info - self._ctx.add_symbol(new_lhs_symb) - self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info) - self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr) + # get new rhs from augmented assignment + new_rhs: PsExpression = reduction_op_to_expr(reduction_op, new_lhs, rhs) return PsAssignment(new_lhs, new_rhs) diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py index fa466e495..1d1cb6a8d 100644 --- a/src/pystencils/backend/transformations/add_pragmas.py +++ b/src/pystencils/backend/transformations/add_pragmas.py @@ -123,11 +123,11 @@ class AddOpenMP: if num_threads is not None: pragma_text += f" num_threads({str(num_threads)})" - if bool(ctx.symbols_reduction_info): - for symbol, reduction_info in ctx.symbols_reduction_info.items(): - if isinstance(symbol.dtype, PsScalarType): + if bool(ctx.reduction_data): + for _, reduction_info in ctx.reduction_data.items(): + if isinstance(reduction_info.local_symbol.dtype, PsScalarType): pragma_text += ( - f" reduction({reduction_info.op.value}: {symbol.name})" + f" reduction({reduction_info.op.value}: {reduction_info.local_symbol.name})" ) else: NotImplementedError( diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index 09b0aa5dd..8061240b7 100644 --- 
a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -143,16 +143,17 @@ class LoopVectorizer: # Prepare reductions simd_init_local_reduction_vars: list[PsStructuralNode] = [] simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] - for symb, reduction_info in self._ctx.symbols_reduction_info.items(): + for _, reduction_info in self._ctx.reduction_data.items(): # Vectorize symbol for local copy - vector_symb = vc.vectorize_symbol(symb) + local_symbol = reduction_info.local_symbol + vector_symb = vc.vectorize_symbol(local_symbol) # Declare and init vector simd_init_local_reduction_vars += [ self._type_fold( PsDeclaration( PsSymbolExpr(vector_symb), - PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + PsVecBroadcast(self._lanes, PsSymbolExpr(local_symbol)), ) ) ] @@ -160,9 +161,9 @@ class LoopVectorizer: # Write back vectorization result simd_writeback_local_reduction_vars += [ PsAssignment( - PsSymbolExpr(symb), + PsSymbolExpr(local_symbol), PsVecHorizontal( - PsSymbolExpr(symb), + PsSymbolExpr(local_symbol), PsSymbolExpr(vector_symb), reduction_info.op, ), diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 3d107eda3..74a07b902 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -192,8 +192,8 @@ class DefaultKernelCreationDriver: self._intermediates.constants_eliminated = kernel_ast.clone() # Extensions for reductions - for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): - self._modify_kernel_ast_for_reductions(symbol, reduction_info, kernel_ast) + for _, reduction_info in self._ctx.reduction_data.items(): + self._modify_kernel_ast_for_reductions(reduction_info, kernel_ast) # Target-Specific optimizations if self._target.is_cpu(): @@ -296,13 +296,12 @@ class DefaultKernelCreationDriver: return kernel_body def _modify_kernel_ast_for_reductions(self, - symbol: PsSymbol, reduction_info: ReductionInfo, kernel_ast: PsBlock): # typify local symbol and write-back pointer expressions and initial value typify = Typifier(self._ctx) - symbol_expr = typify(PsSymbolExpr(symbol)) - ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) + symbol_expr = typify(PsSymbolExpr(reduction_info.local_symbol)) + ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.writeback_ptr_symbol)) init_val = typify(reduction_info.init_val) ptr_access = PsMemAcc( -- GitLab From 081d11ad152bfa6e1f809af7d20789f00d39e5a9 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 19:10:21 +0200 Subject: [PATCH 157/180] Fix indent for error handling in freeze --- src/pystencils/backend/kernelcreation/freeze.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 2f00df4e8..045aca1d1 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -200,11 +200,11 @@ class FreezeExpressions: f"Different reduction operation {info.op} already exists " f"for {expr.lhs} with target reduction op {expr.reduction_op}." ) - else: - raise FreezeError( - f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table." - f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's." - ) + else: + raise FreezeError( + f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table." 
+ f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's." + ) lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) -- GitLab From f0aba2e948fef8f535644e80b0f7d35e0e5f60e0 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 20:06:25 +0200 Subject: [PATCH 158/180] Document attributes of ReductionInfo --- src/pystencils/backend/kernelcreation/context.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 58a4bd7d1..63bfc2f7b 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -50,7 +50,16 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) @dataclass(frozen=True) class ReductionInfo: """Information about a reduction operation, its neutral element in form of an initial value - and the pointer used by the kernel as write-back argument.""" + and the pointer used by the kernel as write-back argument. + + Attributes: + =========== + + reduction_op : Reduction operation being performed + init_val : Initial value used to initialize local symbol + local_symbol : Kernel-local symbol used to accumulate intermediate reduction result + writeback_ptr_symbol : Symbol that is used to export the final reduction result + """ op: ReductionOp init_val: PsExpression -- GitLab From eb6e8c0f8c98d00cb56e6dfc7b7add5036e766ae Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 20:35:16 +0200 Subject: [PATCH 159/180] Check if reduction symbol is (illegally) accessed before/after reduction assignment --- .../backend/kernelcreation/freeze.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 045aca1d1..e0ed0f1f7 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -189,22 +189,12 @@ class FreezeExpressions: def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) - # make sure that either: - # 1) lhs symbol never occurred - # 2) that it is at least known as lhs of an existing reduction operation + # make sure that lhs symbol never occurred before ReductionAssignment if self._ctx.find_symbol(expr.lhs.name): - # make sure that reduction operations are not mixed within a kernel - if info := self._ctx.find_reduction_info(expr.lhs.name): - if info.op is not expr.reduction_op: - raise FreezeError( - f"Different reduction operation {info.op} already exists " - f"for {expr.lhs} with target reduction op {expr.reduction_op}." - ) - else: - raise FreezeError( - f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table." - f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's." - ) + raise FreezeError( + f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table. " + f"Make sure that it is only used once in a kernel's ReductionAssignment." 
+ ) lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) @@ -340,6 +330,16 @@ class FreezeExpressions: def map_TypedSymbol(self, expr: TypedSymbol): dtype = self._ctx.resolve_dynamic_type(expr.dtype) + + # check if symbol is referenced after freezing a ReductionAssignment + if self._ctx.find_reduction_info(expr.name): + # check if types do not align since a ReductionAssignment modifies + # the symbol's type to PsPointerType in the context's symbol table + if (symbol := self._ctx.find_symbol(expr.name)) and symbol.dtype != dtype: + raise FreezeError( + f"Illegal access to reduction symbol {symbol.name} after freezing a kernel's ReductionAssignment. " + ) + symb = self._ctx.get_symbol(expr.name, dtype) return PsSymbolExpr(symb) -- GitLab From f678a2fa2c38e2fa202a2c2a12472cdcc229b3d2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 20:36:36 +0200 Subject: [PATCH 160/180] Add unit test for checking border cases of freezing illegal usages of ReductionAssignments --- tests/nbackend/kernelcreation/test_freeze.py | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index f6c8f85b2..7082f28ad 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -66,6 +66,7 @@ from pystencils.sympyextensions.integer_functions import ( ceil_to_multiple, div_ceil, ) +from pystencils.sympyextensions.reduction import AddReductionAssignment def test_freeze_simple(): @@ -494,6 +495,36 @@ def test_invalid_arrays(): _ = freeze(symb_arr) +def test_invalid_reduction_assignments(): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + ctx = KernelCreationContext() + freeze = FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + invalid_assignment = Assignment(w, -1 * x.center()) + reduction_assignment = AddReductionAssignment(w, 3 * x.center()) + + # reduction symbol is used before ReductionAssignment + with pytest.raises(FreezeError): + _ = [freeze(asm) for asm in [invalid_assignment, reduction_assignment]] + + # reduction symbol is used after ReductionAssignment + with pytest.raises(FreezeError): + _ = [freeze(asm) for asm in [reduction_assignment, invalid_assignment]] + + # duplicate ReductionAssignment + with pytest.raises(FreezeError): + _ = [freeze(asm) for asm in [reduction_assignment, reduction_assignment]] + + def test_memory_access(): ctx = KernelCreationContext() freeze = FreezeExpressions(ctx) -- GitLab From 94dcf3c5362238f1203b88f8a6a5873431749dc5 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 20:47:07 +0200 Subject: [PATCH 161/180] Add unit test for freezing of ReductionAssignments --- tests/nbackend/kernelcreation/test_freeze.py | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index 7082f28ad..a39a0a994 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -44,6 +44,7 @@ from pystencils.backend.ast.expressions import ( PsArrayInitList, PsSubscript, PsMemAcc, + PsSymbolExpr, ) from pystencils.backend.constants import PsConstant from pystencils.backend.functions 
import PsMathFunction, MathFunctions @@ -495,6 +496,31 @@ def test_invalid_arrays(): _ = freeze(symb_arr) +def test_reduction_assignments(): + x = fields(f"x: float64[1d]") + w = TypedSymbol("w", "float64") + + ctx = KernelCreationContext() + freeze = FreezeExpressions(ctx) + + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + expr = freeze(AddReductionAssignment(w, 3 * x.center())) + + info = ctx.find_reduction_info(w.name) + + assert isinstance(expr, PsAssignment) + assert isinstance(expr.lhs, PsSymbolExpr) + + assert expr.lhs.symbol == info.local_symbol + assert expr.lhs.dtype == w.dtype + + def test_invalid_reduction_assignments(): x = fields(f"x: float64[1d]") w = TypedSymbol("w", "float64") -- GitLab From 3010daed816e9fe4ed9b685b1f666c75736d8222 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Thu, 24 Apr 2025 21:08:49 +0200 Subject: [PATCH 162/180] Add typification test for PsVecHorizontal --- .../kernelcreation/test_typification.py | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/nbackend/kernelcreation/test_typification.py b/tests/nbackend/kernelcreation/test_typification.py index 3defe4ad5..31df7090d 100644 --- a/tests/nbackend/kernelcreation/test_typification.py +++ b/tests/nbackend/kernelcreation/test_typification.py @@ -5,6 +5,7 @@ import numpy as np from typing import cast from pystencils import Assignment, TypedSymbol, Field, FieldType, AddAugmentedAssignment +from pystencils.sympyextensions import ReductionOp from pystencils.sympyextensions.pointers import mem_acc from pystencils.backend.ast.structural import ( @@ -34,7 +35,7 @@ from pystencils.backend.ast.expressions import ( PsTernary, PsMemAcc ) -from pystencils.backend.ast.vector import PsVecBroadcast +from pystencils.backend.ast.vector import PsVecBroadcast, PsVecHorizontal from pystencils.backend.constants import PsConstant from pystencils.backend.functions import CFunction from pystencils.types import constify, create_type, create_numeric_type, PsVectorType @@ -649,6 +650,50 @@ def test_typify_bool_vectors(): assert result.get_dtype() == PsVectorType(Bool(), 4) +def test_typify_horizontal_vector_reductions(): + ctx = KernelCreationContext() + typify = Typifier(ctx) + + reduction_op = ReductionOp.Add + stype = Fp(32) + vtype = PsVectorType(stype, 4) + + def create_symb_expr(name, tpe): + return PsExpression.make(ctx.get_symbol(name, tpe)) + + # create valid horizontal and check if expression type is scalar + result = typify( + PsVecHorizontal( + create_symb_expr("s1", stype), create_symb_expr("v1", vtype), ReductionOp.Add + ) + ) + assert result.get_dtype() == stype + + # create invalid horizontal by using scalar type for expected vector type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s2", stype), create_symb_expr("v2", stype), reduction_op + ) + ) + + # create invalid horizontal by using vector type for expected scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s3", vtype), create_symb_expr("v3", vtype), reduction_op + ) + ) + + # create invalid horizontal where base type of vector does not match with scalar type + with pytest.raises(TypificationError): + _ = typify( + PsVecHorizontal( + create_symb_expr("s4", Int(32)), create_symb_expr("v4", vtype), 
reduction_op + ) + ) + + def test_inference_fails(): ctx = KernelCreationContext() typify = Typifier(ctx) -- GitLab From da00b78ff50a2b5e61ae0d9ad53f56d3a6bc4c4d Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 25 Apr 2025 12:58:57 +0200 Subject: [PATCH 163/180] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> --- src/pystencils/backend/ast/vector.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py index 55db67e7c..d7ae8d6a9 100644 --- a/src/pystencils/backend/ast/vector.py +++ b/src/pystencils/backend/ast/vector.py @@ -41,10 +41,9 @@ class PsVecBroadcast(PsUnOp, PsVectorOp): class PsVecHorizontal(PsBinOp, PsVectorOp): - """Represents a binary operation between a scalar and a vector operand. - With the binary operation not being vectorized, a horizontal reduction - along the lanes of the vector operand is required to extract a scalar value. - The result type will be equal to the scalar operand. + """Perform a horizontal reduction across a vector onto a scalar base value. + + **Example:** vec_horizontal_add(s, v)` will compute `s + v[0] + v[1] + ... + v[n-1]`. Args: scalar_operand: Scalar operand -- GitLab From c0df001f196655dd5b3ada6f1fa3909b90584abd Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Fri, 25 Apr 2025 12:59:07 +0200 Subject: [PATCH 164/180] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> --- .../backend/reduction_op_mapping.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py index 832f5d0bf..59273efab 100644 --- a/src/pystencils/backend/reduction_op_mapping.py +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -12,27 +12,20 @@ _available_operator_interface: set[ReductionOp] = { def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: - if op in _available_operator_interface: - match op: - case ReductionOp.Add: - return PsAdd(op1, op2) - case ReductionOp.Sub: - return PsSub(op1, op2) - case ReductionOp.Mul: - return PsMul(op1, op2) - case ReductionOp.Div: - return PsDiv(op1, op2) - case _: - raise FreezeError( - f"Found unsupported operation type for reduction assignments: {op}." - ) - else: - match op: - case ReductionOp.Min: - return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) - case ReductionOp.Max: - return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) - case _: - raise FreezeError( - f"Found unsupported operation type for reduction assignments: {op}." - ) + match op: + case ReductionOp.Add: + return PsAdd(op1, op2) + case ReductionOp.Sub: + return PsSub(op1, op2) + case ReductionOp.Mul: + return PsMul(op1, op2) + case ReductionOp.Div: + return PsDiv(op1, op2) + case ReductionOp.Min: + return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2]) + case ReductionOp.Max: + return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) + case _: + raise FreezeError( + f"Found unsupported operation type for reduction assignments: {op}." 
+ ) -- GitLab From 705ac53161f9a2fe49a2898de36f118a79e2d6a2 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:19:00 +0200 Subject: [PATCH 165/180] Minor doc change --- docs/source/user_manual/reductions.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md index 7c7904dfc..5b45a921c 100644 --- a/docs/source/user_manual/reductions.md +++ b/docs/source/user_manual/reductions.md @@ -105,8 +105,9 @@ To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed but also one for exporting reduction results. The export mechanism can be seen in the previously generated code snippet. Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`). -This pointer not only allows providing initial values for the reduction but is also used for writing back the -reduction result. +This pointer is used for exporting the reduction result back from the kernel. +Please note that the **values passed via pointer will not be overwritten** +but will be incorporated in the reduction computation. Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value. ```{code-cell} ipython3 -- GitLab From 0c8654e3251dea9c4d0ca9e3fa14f54dc970564b Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:36:58 +0200 Subject: [PATCH 166/180] Extend KernelAnalysis to check for invalid references to reduction symbols --- .../backend/kernelcreation/analysis.py | 17 +++++++++ .../nbackend/kernelcreation/test_analysis.py | 38 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tests/nbackend/kernelcreation/test_analysis.py diff --git a/src/pystencils/backend/kernelcreation/analysis.py b/src/pystencils/backend/kernelcreation/analysis.py index 1365e1ef3..e5f8b921e 100644 --- a/src/pystencils/backend/kernelcreation/analysis.py +++ b/src/pystencils/backend/kernelcreation/analysis.py @@ -13,6 +13,8 @@ from ...simp import AssignmentCollection from sympy.codegen.ast import AssignmentBase from ..exceptions import PsInternalCompilerError, KernelConstraintsError +from ...sympyextensions.reduction import ReductionAssignment +from ...sympyextensions.typed_sympy import TypedSymbol class KernelAnalysis: @@ -54,6 +56,8 @@ class KernelAnalysis: self._check_access_independence = check_access_independence self._check_double_writes = check_double_writes + self._reduction_symbols: set[TypedSymbol] = set() + # Map pairs of fields and indices to offsets self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict( set @@ -88,6 +92,14 @@ class KernelAnalysis: for asm in asms: self._visit(asm) + case ReductionAssignment(): + assert isinstance(obj.lhs, TypedSymbol) + + self._reduction_symbols.add(obj.lhs) + + self._handle_rhs(obj.rhs) + self._handle_lhs(obj.lhs) + case AssignmentBase(): self._handle_rhs(obj.rhs) self._handle_lhs(obj.lhs) @@ -152,6 +164,11 @@ class KernelAnalysis: f"{field} is read at {offsets} and written at {write_offset}" ) case sp.Symbol(): + if expr in self._reduction_symbols: + raise KernelConstraintsError( + f"Illegal access to reduction symbol {expr.name} outside of ReductionAssignment. 
" + ) + self._scopes.access_symbol(expr) for arg in expr.args: diff --git a/tests/nbackend/kernelcreation/test_analysis.py b/tests/nbackend/kernelcreation/test_analysis.py new file mode 100644 index 000000000..d68c0a5f3 --- /dev/null +++ b/tests/nbackend/kernelcreation/test_analysis.py @@ -0,0 +1,38 @@ +import pytest + +from pystencils import fields, TypedSymbol, AddReductionAssignment, Assignment, KernelConstraintsError +from pystencils.backend.kernelcreation import KernelCreationContext, KernelAnalysis +from pystencils.sympyextensions import mem_acc +from pystencils.types.quick import Ptr, Fp + + +def test_invalid_reduction_symbol_reassign(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = fields(f"x: [1d]") + w = TypedSymbol("w", dtype) + + # illegal reassign to already locally defined symbol (here: reduction symbol) + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(w, 0) + ]) + +def test_invalid_reduction_symbol_reference(): + dtype = Fp(64) + ctx = KernelCreationContext(default_dtype=dtype) + analysis = KernelAnalysis(ctx) + + x = fields(f"x: [1d]") + v = TypedSymbol("v", dtype) + w = TypedSymbol("w", dtype) + + # do not allow reduction symbol to be referenced on rhs of other assignments + with pytest.raises(KernelConstraintsError): + analysis([ + AddReductionAssignment(w, 3 * x.center()), + Assignment(v, w) + ]) \ No newline at end of file -- GitLab From 325ca38652751eccdd33fe2cc9dc55735c6dfc0c Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:44:08 +0200 Subject: [PATCH 167/180] Move checks and init value determination for ReductionAssignments to add_reduction_info --- .../backend/kernelcreation/context.py | 31 ++++++++++++-- .../backend/kernelcreation/freeze.py | 41 +------------------ 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 63bfc2f7b..1d7e75db1 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -6,7 +6,8 @@ from itertools import chain, count from collections import namedtuple, defaultdict import re -from ..ast.expressions import PsExpression +from ..ast.expressions import PsExpression, PsConstantExpr, PsCall +from ..functions import NumericLimitsFunctions, PsMathFunction from ...defaults import DEFAULTS from ...field import Field, FieldType from ...sympyextensions import ReductionOp @@ -208,10 +209,16 @@ class KernelCreationContext: lhs_name: str, lhs_dtype: PsType, reduction_op: ReductionOp, - init_value: PsExpression, ): """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name.""" + # make sure that lhs symbol never occurred before ReductionAssignment + if self.find_symbol(lhs_name): + raise KernelConstraintsError( + f"Left-hand side {lhs_name} of ReductionAssignment already exists in symbol table. " + f"Make sure that it is only used once in a kernel's ReductionAssignment." 
+ ) + # replace datatype of lhs symbol with pointer datatype for write-back mechanism symb = self.get_symbol(lhs_name, lhs_dtype) pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype)) @@ -221,9 +228,27 @@ class KernelCreationContext: local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype) self.add_symbol(local_symb) + # match for reduction operation and set neutral init_val + init_val: PsExpression + match reduction_op: + case ReductionOp.Add: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Sub: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Mul: + init_val = PsConstantExpr(PsConstant(1)) + case ReductionOp.Min: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + case ReductionOp.Max: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) + case _: + raise PsInternalCompilerError( + f"Unsupported kind of reduction assignment: {reduction_op}." + ) + # create reduction info and add to set reduction_info = ReductionInfo( - reduction_op, init_value, local_symb, pointer_symb + reduction_op, init_val, local_symb, pointer_symb ) self._reduction_data[lhs_name] = reduction_info diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index e0ed0f1f7..4c7b8fb23 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -189,21 +189,12 @@ class FreezeExpressions: def map_ReductionAssignment(self, expr: ReductionAssignment): assert isinstance(expr.lhs, TypedSymbol) - # make sure that lhs symbol never occurred before ReductionAssignment - if self._ctx.find_symbol(expr.lhs.name): - raise FreezeError( - f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table. " - f"Make sure that it is only used once in a kernel's ReductionAssignment." - ) - - lhs = self.visit(expr.lhs) rhs = self.visit(expr.rhs) assert isinstance(rhs, PsExpression) - assert isinstance(lhs, PsSymbolExpr) reduction_op = expr.reduction_op - lhs_symbol = lhs.symbol + lhs_symbol = expr.lhs lhs_dtype = lhs_symbol.dtype lhs_name = lhs_symbol.name @@ -211,27 +202,9 @@ class FreezeExpressions: lhs_dtype, PsNumericType ), "Reduction assignments require type information of the lhs symbol." - # match for reduction operation and set neutral init_val - init_val: PsExpression - match reduction_op: - case ReductionOp.Add: - init_val = PsConstantExpr(PsConstant(0)) - case ReductionOp.Sub: - init_val = PsConstantExpr(PsConstant(0)) - case ReductionOp.Mul: - init_val = PsConstantExpr(PsConstant(1)) - case ReductionOp.Min: - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) - case ReductionOp.Max: - init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) - case _: - raise FreezeError( - f"Unsupported kind of reduction assignment: {reduction_op}." 
- ) - # get reduction info from context reduction_info = self._ctx.add_reduction_info( - lhs_name, lhs_dtype, reduction_op, init_val + lhs_name, lhs_dtype, reduction_op ) # create new lhs from newly created local lhs symbol @@ -330,16 +303,6 @@ class FreezeExpressions: def map_TypedSymbol(self, expr: TypedSymbol): dtype = self._ctx.resolve_dynamic_type(expr.dtype) - - # check if symbol is referenced after freezing a ReductionAssignment - if self._ctx.find_reduction_info(expr.name): - # check if types do not align since a ReductionAssignment modifies - # the symbol's type to PsPointerType in the context's symbol table - if (symbol := self._ctx.find_symbol(expr.name)) and symbol.dtype != dtype: - raise FreezeError( - f"Illegal access to reduction symbol {symbol.name} after freezing a kernel's ReductionAssignment. " - ) - symb = self._ctx.get_symbol(expr.name, dtype) return PsSymbolExpr(symb) -- GitLab From 1580a2b06b0aa060def7428cf5ac3f7938b81f92 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:45:11 +0200 Subject: [PATCH 168/180] Adapt test_invalid_reduction_assignments to create new contexts for each subtest --- tests/nbackend/kernelcreation/test_freeze.py | 46 +++++++++++--------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index a39a0a994..987b68043 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -8,6 +8,7 @@ from pystencils import ( create_numeric_type, TypedSymbol, DynamicType, + KernelConstraintsError, ) from pystencils.sympyextensions import tcast from pystencils.sympyextensions.pointers import mem_acc @@ -68,6 +69,7 @@ from pystencils.sympyextensions.integer_functions import ( div_ceil, ) from pystencils.sympyextensions.reduction import AddReductionAssignment +from pystencils.types import PsTypeError def test_freeze_simple(): @@ -525,30 +527,34 @@ def test_invalid_reduction_assignments(): x = fields(f"x: float64[1d]") w = TypedSymbol("w", "float64") - ctx = KernelCreationContext() - freeze = FreezeExpressions(ctx) - - one = PsExpression.make(PsConstant(1, ctx.index_dtype)) - counter = ctx.get_symbol("ctr", ctx.index_dtype) - ispace = FullIterationSpace( - ctx, [FullIterationSpace.Dimension(one, one, one, counter)] - ) - ctx.set_iteration_space(ispace) - - invalid_assignment = Assignment(w, -1 * x.center()) + assignment = Assignment(w, -1 * x.center()) reduction_assignment = AddReductionAssignment(w, 3 * x.center()) - # reduction symbol is used before ReductionAssignment - with pytest.raises(FreezeError): - _ = [freeze(asm) for asm in [invalid_assignment, reduction_assignment]] + expected_errors_for_invalid_cases = [ + # 1) Reduction symbol is used before ReductionAssignment. + # May only be used for reductions -> KernelConstraintsError + ([assignment, reduction_assignment], KernelConstraintsError), + # 2) Reduction symbol is used after ReductionAssignment. 
+ # Reduction symbol is converted to pointer after freeze -> PsTypeError + ([reduction_assignment, assignment], PsTypeError), + # 3) Duplicate ReductionAssignment + # May only be used once for now -> KernelConstraintsError + ([reduction_assignment, reduction_assignment], KernelConstraintsError) + ] - # reduction symbol is used after ReductionAssignment - with pytest.raises(FreezeError): - _ = [freeze(asm) for asm in [reduction_assignment, invalid_assignment]] + for invalid_assignment, error_class in expected_errors_for_invalid_cases: + ctx = KernelCreationContext() + freeze = FreezeExpressions(ctx) - # duplicate ReductionAssignment - with pytest.raises(FreezeError): - _ = [freeze(asm) for asm in [reduction_assignment, reduction_assignment]] + one = PsExpression.make(PsConstant(1, ctx.index_dtype)) + counter = ctx.get_symbol("ctr", ctx.index_dtype) + ispace = FullIterationSpace( + ctx, [FullIterationSpace.Dimension(one, one, one, counter)] + ) + ctx.set_iteration_space(ispace) + + with pytest.raises(error_class): + _ = [freeze(asm) for asm in invalid_assignment] def test_memory_access(): -- GitLab From 490ec9144b01c10c6a6e8d1a927fd42e37a56511 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:51:08 +0200 Subject: [PATCH 169/180] Omit old stuff from reduction_op_to_expr --- src/pystencils/backend/reduction_op_mapping.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py index 59273efab..389c0940a 100644 --- a/src/pystencils/backend/reduction_op_mapping.py +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -1,16 +1,8 @@ from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv -from .exceptions import FreezeError +from .exceptions import PsInternalCompilerError from .functions import PsMathFunction, MathFunctions from ..sympyextensions.reduction import ReductionOp -_available_operator_interface: set[ReductionOp] = { - ReductionOp.Add, - ReductionOp.Sub, - ReductionOp.Mul, - ReductionOp.Div, -} - - def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: match op: case ReductionOp.Add: @@ -26,6 +18,6 @@ def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: case ReductionOp.Max: return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2]) case _: - raise FreezeError( + raise PsInternalCompilerError( f"Found unsupported operation type for reduction assignments: {op}." 
) -- GitLab From 4106f9fdf0a4f1c0c6806ac3bca2a6ae467d31d1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 15:57:33 +0200 Subject: [PATCH 170/180] Parameterize test_reduction_assignments with reduction ops --- tests/nbackend/kernelcreation/test_freeze.py | 26 +++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py index 987b68043..fe31cf94c 100644 --- a/tests/nbackend/kernelcreation/test_freeze.py +++ b/tests/nbackend/kernelcreation/test_freeze.py @@ -68,7 +68,13 @@ from pystencils.sympyextensions.integer_functions import ( ceil_to_multiple, div_ceil, ) -from pystencils.sympyextensions.reduction import AddReductionAssignment +from pystencils.sympyextensions.reduction import ( + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, +) from pystencils.types import PsTypeError @@ -498,10 +504,22 @@ def test_invalid_arrays(): _ = freeze(symb_arr) -def test_reduction_assignments(): +@pytest.mark.parametrize("reduction_assignment_rhs_type", + [ + (AddReductionAssignment, PsAdd), + (SubReductionAssignment, PsSub), + (MulReductionAssignment, PsMul), + (MinReductionAssignment, PsCall), + (MaxReductionAssignment, PsCall), + ]) +def test_reduction_assignments( + reduction_assignment_rhs_type +): x = fields(f"x: float64[1d]") w = TypedSymbol("w", "float64") + reduction_op, rhs_type = reduction_assignment_rhs_type + ctx = KernelCreationContext() freeze = FreezeExpressions(ctx) @@ -512,7 +530,7 @@ def test_reduction_assignments(): ) ctx.set_iteration_space(ispace) - expr = freeze(AddReductionAssignment(w, 3 * x.center())) + expr = freeze(reduction_op(w, 3 * x.center())) info = ctx.find_reduction_info(w.name) @@ -522,6 +540,8 @@ def test_reduction_assignments(): assert expr.lhs.symbol == info.local_symbol assert expr.lhs.dtype == w.dtype + assert isinstance(expr.rhs, rhs_type) + def test_invalid_reduction_assignments(): x = fields(f"x: float64[1d]") -- GitLab From f3105780593ce575581cc6231c7b02f87fe40319 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 16:15:37 +0200 Subject: [PATCH 171/180] Add unit test for ReductionAssignment --- tests/frontend/test_sympyextensions.py | 38 +++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/frontend/test_sympyextensions.py b/tests/frontend/test_sympyextensions.py index ad5d2513b..152527441 100644 --- a/tests/frontend/test_sympyextensions.py +++ b/tests/frontend/test_sympyextensions.py @@ -3,7 +3,9 @@ import numpy as np import sympy as sp import pystencils -from pystencils import Assignment +import pytest + +from pystencils import Assignment, TypedSymbol from pystencils.sympyextensions import replace_second_order_products from pystencils.sympyextensions import remove_higher_order_terms from pystencils.sympyextensions import complete_the_squares_in_exp @@ -27,6 +29,16 @@ from pystencils.sympyextensions.integer_functions import ( div_ceil, ) +from pystencils.sympyextensions.reduction import ( + ReductionOp, + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + reduction_assignment, +) + def test_utility(): a = [1, 2] @@ -199,6 +211,30 @@ def test_count_operations(): assert ops["muls"] == 99 +@pytest.mark.parametrize("reduction_assignment_for_op", [ + (ReductionOp.Add, 
AddReductionAssignment), + (ReductionOp.Sub, SubReductionAssignment), + (ReductionOp.Mul, MulReductionAssignment), + (ReductionOp.Min, MinReductionAssignment), + (ReductionOp.Max, MaxReductionAssignment), +]) +def test_reduction_assignments( + reduction_assignment_for_op +): + reduction_op, reduction_assignment_type = reduction_assignment_for_op + + w = TypedSymbol("w", "float64") + v = sympy.symbols("v") + + assignment = reduction_assignment(w, reduction_op, 0) + + assert isinstance(assignment, reduction_assignment_type) + + # invalid assignment since v is not a typed symbol + with pytest.raises(TypeError): + _ = reduction_assignment(v, reduction_op, 0) + + def test_common_denominator(): x = sympy.symbols("x") expr = sympy.Rational(1, 2) + x * sympy.Rational(2, 3) -- GitLab From 3757e179c0e3b73f715819dbef1c316ba1b13af8 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 16:27:07 +0200 Subject: [PATCH 172/180] Omit unnecessary replace_symbol call --- src/pystencils/backend/kernelcreation/context.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 1d7e75db1..3e79bf24a 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -219,10 +219,8 @@ class KernelCreationContext: f"Make sure that it is only used once in a kernel's ReductionAssignment." ) - # replace datatype of lhs symbol with pointer datatype for write-back mechanism - symb = self.get_symbol(lhs_name, lhs_dtype) - pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype)) - self.replace_symbol(symb, pointer_symb) + # add symbol for lhs with pointer datatype for write-back mechanism + pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype)) # create kernel-local copy of lhs symbol local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype) -- GitLab From 508701f6d3f352d0adfb655b8ea9aae3a15a5745 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 16:32:16 +0200 Subject: [PATCH 173/180] Fix lint --- src/pystencils/backend/kernelcreation/freeze.py | 2 +- src/pystencils/backend/reduction_op_mapping.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 4c7b8fb23..b1bb4cd4a 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -63,7 +63,7 @@ from ..ast.vector import PsVecMemAcc from ..constants import PsConstant from ...types import PsNumericType, PsStructType, PsType from ..exceptions import PsInputError -from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions +from ..functions import PsMathFunction, MathFunctions from ..exceptions import FreezeError diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py index 389c0940a..a97a496e0 100644 --- a/src/pystencils/backend/reduction_op_mapping.py +++ b/src/pystencils/backend/reduction_op_mapping.py @@ -3,6 +3,7 @@ from .exceptions import PsInternalCompilerError from .functions import PsMathFunction, MathFunctions from ..sympyextensions.reduction import ReductionOp + def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression: match op: case ReductionOp.Add: -- GitLab From 9c5b9bce9065759461bb0c6ed2e88b4a665789d6 Mon Sep 17 00:00:00 2001 From: zy69guqi 
<richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 16:35:17 +0200 Subject: [PATCH 174/180] Omit unnecessary type context creations for PsVecHorizontal --- .../backend/kernelcreation/typification.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 9585cb23f..8dfc57a7a 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -584,34 +584,32 @@ class Typifier: # bin op consisting of a scalar and a vector that is converted to a scalar # -> whole expression should be treated as scalar - scalar_op_tc = TypeContext() - self.visit_expr(expr.scalar_operand, scalar_op_tc) + self.visit_expr(expr.scalar_operand, tc) - vector_op_tc = TypeContext() - self.visit_expr(expr.vector_operand, vector_op_tc) + self.visit_expr(expr.vector_operand, tc) - if scalar_op_tc.target_type is None or vector_op_tc.target_type is None: + if tc.target_type is None or tc.target_type is None: raise TypificationError( f"Unable to determine type of argument to vector horizontal: {expr}" ) - if not isinstance(scalar_op_tc.target_type, PsScalarType): + if not isinstance(tc.target_type, PsScalarType): raise TypificationError( - f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}" + f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}" ) - if not isinstance(vector_op_tc.target_type, PsVectorType): + if not isinstance(tc.target_type, PsVectorType): raise TypificationError( - f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + f"Illegal type in vector operand (op2) to vector horizontal: {tc.target_type}" ) - if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type: + if tc.target_type.scalar_type is not tc.target_type: raise TypificationError( - f"Scalar type of vector operand {vector_op_tc.target_type} " - f"does not correspond to type of scalar operand {scalar_op_tc.target_type}" + f"Scalar type of vector operand {tc.target_type} " + f"does not correspond to type of scalar operand {tc.target_type}" ) - tc.apply_dtype(scalar_op_tc.target_type, expr) + tc.apply_dtype(tc.target_type, expr) case PsBinOp(op1, op2): self.visit_expr(op1, tc) -- GitLab From a14f13fb6bd17fb9dbe8e4db00a3ebd3fd2391a1 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Fri, 25 Apr 2025 17:39:22 +0200 Subject: [PATCH 175/180] Omit extra type context creation for scalar op in PsVecHorizontal --- .../backend/kernelcreation/typification.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 9585cb23f..1c34fac6f 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -584,20 +584,19 @@ class Typifier: # bin op consisting of a scalar and a vector that is converted to a scalar # -> whole expression should be treated as scalar - scalar_op_tc = TypeContext() - self.visit_expr(expr.scalar_operand, scalar_op_tc) + self.visit_expr(expr.scalar_operand, tc) vector_op_tc = TypeContext() self.visit_expr(expr.vector_operand, vector_op_tc) - if scalar_op_tc.target_type is None or vector_op_tc.target_type is None: + if tc.target_type is None or vector_op_tc.target_type is None: raise TypificationError( 
f"Unable to determine type of argument to vector horizontal: {expr}" ) - if not isinstance(scalar_op_tc.target_type, PsScalarType): + if not isinstance(tc.target_type, PsScalarType): raise TypificationError( - f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}" + f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}" ) if not isinstance(vector_op_tc.target_type, PsVectorType): @@ -605,13 +604,13 @@ class Typifier: f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" ) - if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type: + if vector_op_tc.target_type.scalar_type is not tc.target_type: raise TypificationError( f"Scalar type of vector operand {vector_op_tc.target_type} " - f"does not correspond to type of scalar operand {scalar_op_tc.target_type}" + f"does not correspond to type of scalar operand {tc.target_type}" ) - tc.apply_dtype(scalar_op_tc.target_type, expr) + tc.apply_dtype(tc.target_type, expr) case PsBinOp(op1, op2): self.visit_expr(op1, tc) -- GitLab From a9c8d6a7ccf65e79fc57d3e92818becb513ae4d3 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Sat, 26 Apr 2025 21:01:57 +0200 Subject: [PATCH 176/180] Apply 4 suggestion(s) to 2 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> --- src/pystencils/backend/kernelcreation/context.py | 9 ++++----- src/pystencils/backend/kernelcreation/freeze.py | 5 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 3e79bf24a..48e2f4a3a 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -207,7 +207,7 @@ class KernelCreationContext: def add_reduction_info( self, lhs_name: str, - lhs_dtype: PsType, + lhs_dtype: PsNumericType, reduction_op: ReductionOp, ): """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name.""" @@ -215,16 +215,15 @@ class KernelCreationContext: # make sure that lhs symbol never occurred before ReductionAssignment if self.find_symbol(lhs_name): raise KernelConstraintsError( - f"Left-hand side {lhs_name} of ReductionAssignment already exists in symbol table. " - f"Make sure that it is only used once in a kernel's ReductionAssignment." + f"Cannot create reduction with symbol {lhs_name}: " + "Another symbol with the same name already exist." ) # add symbol for lhs with pointer datatype for write-back mechanism pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype)) # create kernel-local copy of lhs symbol - local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype) - self.add_symbol(local_symb) + local_symb = self.get_new_symbol(f"{lhs_name}_local", lhs_dtype) # match for reduction operation and set neutral init_val init_val: PsExpression diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index b1bb4cd4a..598716567 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -198,9 +198,8 @@ class FreezeExpressions: lhs_dtype = lhs_symbol.dtype lhs_name = lhs_symbol.name - assert isinstance( - lhs_dtype, PsNumericType - ), "Reduction assignments require type information of the lhs symbol." 
+ if not isinstance(lhs_dtype, PsNumericType): + raise FreezeError("Reduction symbol must have a numeric data type.") # get reduction info from context reduction_info = self._ctx.add_reduction_info( -- GitLab From 65362ddfc2bed0a28a7c071962366c206d552d97 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 30 Apr 2025 13:55:29 +0200 Subject: [PATCH 177/180] Fix typecheck note for match args of PsConstantFunction --- src/pystencils/backend/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py index 3ff61e039..6a9d3e4f4 100644 --- a/src/pystencils/backend/functions.py +++ b/src/pystencils/backend/functions.py @@ -163,7 +163,7 @@ class PsConstantFunction(PsFunction): and will be broadcast by the vectorizer. """ - __match_args__ = ("func,") + __match_args__ = ("func",) def __init__( self, func: ConstantFunctions, dtype: PsNumericType | None = None -- GitLab From 2ca58226cdac171f0373f4cd23b4efb77e9e7505 Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 30 Apr 2025 15:28:14 +0200 Subject: [PATCH 178/180] Minor adaptation of required gpu headers --- src/pystencils/backend/platforms/cuda.py | 2 +- src/pystencils/backend/platforms/generic_gpu.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 6e6488ee1..60571db94 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -33,7 +33,7 @@ class CudaPlatform(GenericGpu): @property def required_headers(self) -> set[str]: - return super().required_headers | {'"pystencils_runtime/cuda.cuh"'} + return super().required_headers | {'"pystencils_runtime/cuda.cuh"', '"gpu_atomics.h"'} def resolve_reduction( self, diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 2cfe11d51..b87e6411f 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -203,9 +203,7 @@ class GenericGpu(Platform): @property @abstractmethod def required_headers(self) -> set[str]: - return { - '"gpu_atomics.h"', - } + return set() @abstractmethod def resolve_reduction( -- GitLab From c430559032934ce9234dfad11fea2d60bd37c6ff Mon Sep 17 00:00:00 2001 From: zy69guqi <richard.angersbach@fau.de> Date: Wed, 30 Apr 2025 15:39:06 +0200 Subject: [PATCH 179/180] Fix missing resolution of ConstantFunctions on GPU platforms --- src/pystencils/backend/platforms/generic_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index b87e6411f..06b230454 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -313,7 +313,7 @@ class GenericGpu(Platform): arg_types = (dtype,) * call.function.arg_count expr: PsExpression | None = None - if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions: + if isinstance(dtype, PsIeeeFloatType): match func: case ( MathFunctions.Exp -- GitLab From 3e66466514bc1d426056369e21d0ed53580082f5 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Wed, 30 Apr 2025 15:47:44 +0200 Subject: [PATCH 180/180] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Frederik Hennig <frederik.hennig@fau.de> --- src/pystencils/codegen/driver.py | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index 74a07b902..59e313913 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -320,8 +320,7 @@ class DefaultKernelCreationDriver: append_ast = [PsAssignment(ptr_access, write_back_ptr)] # modify AST - kernel_ast.statements = prepend_ast + kernel_ast.statements - kernel_ast.statements += append_ast + kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock: canonicalize = CanonicalizeSymbols(self._ctx, True) -- GitLab
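
Taken together, the patches in this series provide the user-facing reduction API described in the documentation above.
The sketch below assembles the pieces from the docs and tests touched in this series: a sum reduction over a 1D field,
vectorized and parallelized on the CPU, with the result exported through a single-element array.
The configuration flags and `AddReductionAssignment` appear verbatim in the series; the call convention of the
compiled kernel (keyword arguments named after the field and the reduction symbol) is an assumption and may differ
from the actual interface.

```python
# Minimal usage sketch assembled from the documentation and tests in this series.
# The compiled-kernel call convention below is an assumption, not authoritative.
import numpy as np

import pystencils as ps
from pystencils import TypedSymbol
from pystencils.sympyextensions.reduction import AddReductionAssignment

x = ps.fields("x: float64[1d]")
w = TypedSymbol("w", "float64")

# Accumulate w += 3 * x[i] over the whole iteration space
assign_sum = [AddReductionAssignment(w, 3 * x.center())]

cfg = ps.CreateKernelConfig(target=ps.Target.X86_AVX)
cfg.cpu.vectorize.enable = True
cfg.cpu.vectorize.assume_inner_stride_one = True  # required for reductions
cfg.cpu.openmp.enable = True

kernel = ps.create_kernel(assign_sum, cfg).compile()

x_arr = np.ones(32, dtype=np.float64)
w_arr = np.zeros(1, dtype=np.float64)  # single-element array receives the reduction result

# Assumed call convention: arrays are passed by field / reduction-symbol name.
kernel(x=x_arr, w=w_arr)
print(w_arr[0])  # expected: 96.0, since the initial value 0.0 is incorporated, not overwritten
```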