From 8a2bf6648548c18ab47d71db429956e2bf1f52b6 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 14 Jan 2025 18:13:18 +0100
Subject: [PATCH 001/180] Initial work for introducing reduction capabilities
 to pystencils

Signed-off-by: zy69guqi <richard.angersbach@fau.de>
---
 .../backend/kernelcreation/freeze.py          | 27 +++++++++
 src/pystencils/simp/assignment_collection.py  | 13 +++++
 src/pystencils/sympyextensions/__init__.py    |  2 +
 src/pystencils/sympyextensions/reduction.py   | 57 +++++++++++++++++++
 tests/kernelcreation/test_reduction.py        | 44 ++++++++++++++
 5 files changed, 143 insertions(+)
 create mode 100644 src/pystencils/sympyextensions/reduction.py
 create mode 100644 tests/kernelcreation/test_reduction.py

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 44ee17077..65be23065 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -1,3 +1,4 @@
+from sympyextensions.reduction import ReducedAssignment
 from typing import overload, cast, Any
 from functools import reduce
 from operator import add, mul, sub, truediv
@@ -183,6 +184,32 @@ class FreezeExpressions:
 
         return PsAssignment(lhs, op(lhs.clone(), rhs))
 
+    def map_ReducedAssignment(self, expr: ReducedAssignment):
+        lhs = self.visit(expr.lhs)
+        rhs = self.visit(expr.rhs)
+
+        assert isinstance(lhs, PsExpression)
+        assert isinstance(rhs, PsExpression)
+
+        match expr.op:
+            case "+=":
+                op = add
+            case "-=":
+                op = sub
+            case "*=":
+                op = mul
+            case "/=":
+                op = truediv
+            # TODO: unsure if sp.Min & sp.Max work here
+            case "min=":
+                op = sp.Min
+            case "max=":
+                op = sp.Max
+            case _:
+                raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
+
+        return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment?
+
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
         return PsSymbolExpr(symb)
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index f1ba87154..4de3e8dc6 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,5 +1,8 @@
 import itertools
 from copy import copy
+
+from sympyextensions import reduced_assign
+from sympyextensions.reduction import ReducedAssignment
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union
 
 import sympy as sp
@@ -55,8 +58,13 @@ class AssignmentCollection:
         subexpressions = list(itertools.chain.from_iterable(
             [(a if isinstance(a, Iterable) else [a]) for a in subexpressions]))
 
+        # filter out reduced assignments
+        reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)]
+        main_assignments = [a for a in main_assignments if (a not in reduced_assignments)]
+
         self.main_assignments = main_assignments
         self.subexpressions = subexpressions
+        self.reductions = reduced_assignments
 
         if simplification_hints is None:
             simplification_hints = {}
@@ -71,6 +79,11 @@ class AssignmentCollection:
         else:
             self.subexpression_symbol_generator = subexpression_symbol_generator
 
+    def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None:
+        """Adds a reduction of ``rhs`` into ``lhs`` using operator ``op``; ``lhs`` must not have a reduction yet."""
+        assert all(r.lhs != lhs for r in self.reductions), f"Reduction for lhs {lhs} exists"
+        self.reductions.append(reduced_assign(lhs, op, rhs))
+
     def add_simplification_hint(self, key: str, value: Any) -> None:
         """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
         assert key not in self.simplification_hints, "This hint already exists"
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 7431416c9..6ab24e936 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
+from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -33,6 +34,7 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
+    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
new file mode 100644
index 000000000..aa947c1d2
--- /dev/null
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -0,0 +1,57 @@
+from sympy.codegen.ast import AssignmentBase
+
+
+class ReducedAssignment(AssignmentBase):
+    """
+    Base class for reduced assignments.
+
+    Attributes:
+    ===========
+
+    binop : str
+       Symbol for binary operation being applied in the assignment, such as "+",
+       "*", etc.
+    """
+    binop = None  # type: str
+
+    # TODO: initial value
+
+    @property
+    def op(self):
+        return self.binop + '='
+
+
+class AddReducedAssignment(ReducedAssignment):
+    binop = '+'
+
+class SubReducedAssignment(ReducedAssignment):
+    binop = '-'
+
+
+class MulReducedAssignment(ReducedAssignment):
+    binop = '*'
+
+
+class DivReducedAssignment(ReducedAssignment):
+    binop = '/'
+
+
+class MinReducedssignment(ReducedAssignment):
+    binop = 'min'
+
+class MaxReducedssignment(ReducedAssignment):
+    binop = 'max'
+
+
+# Mapping from binary op strings to AugmentedAssignment subclasses
+reduced_assign_classes = {
+    cls.binop: cls for cls in [
+        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment,
+        MinReducedssignment, MaxReducedssignment
+    ]
+}
+
+def reduced_assign(lhs, op, rhs):
+    if op not in reduced_assign_classes:
+        raise ValueError("Unrecognized operator %s" % op)
+    return reduced_assign_classes[op](lhs, rhs)
\ No newline at end of file
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
new file mode 100644
index 000000000..47509e267
--- /dev/null
+++ b/tests/kernelcreation/test_reduction.py
@@ -0,0 +1,44 @@
+import pytest
+import numpy as np
+import sympy as sp
+
+import pystencils as ps
+from sympyextensions.reduction import reduced_assign
+
+
+@pytest.mark.parametrize('dtype', ["float64", "float32"])
+def test_log(dtype):
+    a = sp.Symbol("a")
+    x = ps.fields(f'x: {dtype}[1d]')
+
+    # kernel with main assignments and no reduction
+
+    main_assignment = ps.AssignmentCollection({x.center(): a})
+
+    ast_main = ps.create_kernel(main_assignment, default_dtype=dtype)
+    code_main = ps.get_code_str(ast_main)
+    kernel_main = ast_main.compile()
+
+    # ps.show_code(ast)
+
+    if dtype == "float64":
+        assert "float" not in code_main
+
+    array = np.zeros((10,), dtype=dtype)
+    kernel_main(x=array, a=100)
+    assert np.allclose(array, 100)
+
+    # kernel with single reduction assignment
+
+    omega = sp.Symbol("omega")
+
+    reduction_assignment = reduced_assign(omega, "+", x.center())
+
+    ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
+    code_reduction = ps.get_code_str(ast_reduction)
+    kernel_reduction = ast_reduction.compile()
+
+    if dtype == "float64":
+        assert "float" not in code_reduction
+
+    ps.show_code(ast_reduction)
\ No newline at end of file
-- 
GitLab


From 543bf118944b32b851b526964ee275d7a1808034 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 11:36:53 +0100
Subject: [PATCH 002/180] Fix relative module imports for newly introduced
 sympyextensions for reductions

---
 src/pystencils/backend/kernelcreation/freeze.py | 2 +-
 src/pystencils/simp/assignment_collection.py    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 65be23065..4d75f1ca6 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -1,4 +1,3 @@
-from sympyextensions.reduction import ReducedAssignment
 from typing import overload, cast, Any
 from functools import reduce
 from operator import add, mul, sub, truediv
@@ -16,6 +15,7 @@ from ...sympyextensions import (
 )
 from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
+from ...sympyextensions.reduction import ReducedAssignment
 from ...field import Field, FieldType
 
 from .context import KernelCreationContext
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index 4de3e8dc6..212dbf751 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,8 +1,6 @@
 import itertools
 from copy import copy
 
-from sympyextensions import reduced_assign
-from sympyextensions.reduction import ReducedAssignment
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union
 
 import sympy as sp
@@ -11,6 +9,8 @@ import pystencils
 from ..assignment import Assignment
 from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs)
 from ..sympyextensions import count_operations, fast_subs
+from ..sympyextensions import reduced_assign
+from ..sympyextensions.reduction import ReducedAssignment
 
 
 class AssignmentCollection:
-- 
GitLab


From 558a0f20e082370a0bccd20b96a647e3536bc31e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 12:59:36 +0100
Subject: [PATCH 003/180] Expose new reduced assignments to pystencils
 interface

---
 src/pystencils/__init__.py             | 14 ++++++++++++++
 tests/kernelcreation/test_reduction.py |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 6cb375b61..eecd929cf 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -38,6 +38,14 @@ from .simp import AssignmentCollection
 from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator
 from .datahandling import create_data_handling
+from .sympyextensions.reduction import (
+    AddReducedAssignment,
+    SubReducedAssignment,
+    MulReducedAssignment,
+    DivReducedAssignment,
+    MinReducedssignment,
+    MaxReducedssignment
+)
 
 __all__ = [
     "Field",
@@ -69,6 +77,12 @@ __all__ = [
     "AssignmentCollection",
     "Assignment",
     "AddAugmentedAssignment",
+    "AddReducedAssignment",
+    "SubReducedAssignment",
+    "MulReducedAssignment",
+    "DivReducedAssignment",
+    "MinReducedssignment",
+    "MaxReducedssignment",
     "assignment_from_stencil",
     "SymbolCreator",
     "create_data_handling",
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 47509e267..f8c2b1870 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -3,7 +3,7 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from sympyextensions.reduction import reduced_assign
+from pystencils import AddReducedAssignment
 
 
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
@@ -32,7 +32,7 @@ def test_log(dtype):
 
     omega = sp.Symbol("omega")
 
-    reduction_assignment = reduced_assign(omega, "+", x.center())
+    reduction_assignment = AddReducedAssignment(omega, x.center())
 
     ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
     code_reduction = ps.get_code_str(ast_reduction)
-- 
GitLab


From ba1458538a9c954803d26337f7b428f599421f2c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:36:18 +0100
Subject: [PATCH 004/180] Get rid of reduction using the division operator

---
 src/pystencils/__init__.py                  | 2 --
 src/pystencils/sympyextensions/reduction.py | 6 +-----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index eecd929cf..916a61392 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -42,7 +42,6 @@ from .sympyextensions.reduction import (
     AddReducedAssignment,
     SubReducedAssignment,
     MulReducedAssignment,
-    DivReducedAssignment,
     MinReducedssignment,
     MaxReducedssignment
 )
@@ -80,7 +79,6 @@ __all__ = [
     "AddReducedAssignment",
     "SubReducedAssignment",
     "MulReducedAssignment",
-    "DivReducedAssignment",
     "MinReducedssignment",
     "MaxReducedssignment",
     "assignment_from_stencil",
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index aa947c1d2..90ab61ede 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -32,10 +32,6 @@ class MulReducedAssignment(ReducedAssignment):
     binop = '*'
 
 
-class DivReducedAssignment(ReducedAssignment):
-    binop = '/'
-
-
 class MinReducedssignment(ReducedAssignment):
     binop = 'min'
 
@@ -46,7 +42,7 @@ class MaxReducedssignment(ReducedAssignment):
 # Mapping from binary op strings to AugmentedAssignment subclasses
 reduced_assign_classes = {
     cls.binop: cls for cls in [
-        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment,
+        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment,
         MinReducedssignment, MaxReducedssignment
     ]
 }
-- 
GitLab


From 778cfd51b4df56c7fccf3686b0c0d3273b43a202 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:39:39 +0100
Subject: [PATCH 005/180] Add functions for numeric limits (to be supported by
 the backends)

---
 src/pystencils/backend/functions.py             | 10 ++++++++++
 src/pystencils/backend/platforms/generic_cpu.py |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 388160f30..ea0d6cb9d 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -94,6 +94,16 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
+class NumericLimitsFunctions(MathFunctions):
+    """Numerical limits functions supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    min = ("min", 0)
+    max = ("max", 0)
+
+
 class PsMathFunction(PsFunction):
     """Homogenously typed mathematical functions."""
 
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 94fbfa0e1..7cb378703 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -43,7 +43,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>"}
+        return {"<math.h>", "<limits.h>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -62,6 +62,8 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
+        # TODO: numeric limits
+
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
             match func:
-- 
GitLab


From 719a76fba40197320d03bab06e1d139e6d24a724 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:42:00 +0100
Subject: [PATCH 006/180] Introduce reduction symbol property and add to lhs of
 reduced symbol

---
 .../backend/kernelcreation/context.py         |  2 ++
 .../backend/kernelcreation/freeze.py          | 28 ++++++++++++-------
 src/pystencils/codegen/properties.py          | 10 +++++++
 src/pystencils/sympyextensions/reduction.py   |  6 ++--
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 39fb8ef6d..4b4604a21 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -75,6 +75,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
+        # TODO: add list of reduction symbols
+
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 4d75f1ca6..0d1ce72e1 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -65,6 +65,9 @@ from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions
 from ..exceptions import FreezeError
 
+import backend.functions
+from codegen.properties import ReductionSymbolProperty
+
 
 ExprLike = (
     sp.Expr
@@ -188,27 +191,32 @@ class FreezeExpressions:
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
 
-        assert isinstance(lhs, PsExpression)
         assert isinstance(rhs, PsExpression)
+        assert isinstance(lhs, PsSymbolExpr)
 
         match expr.op:
-            case "+=":
+            case "+":
                 op = add
-            case "-=":
+                init_val = PsConstant(0)
+            case "-":
                 op = sub
-            case "*=":
+                init_val = PsConstant(0)
+            case "*":
                 op = mul
-            case "/=":
-                op = truediv
-            # TODO: unsure if sp.Min & sp.Max work here
-            case "min=":
+                init_val = PsConstant(1)
+            # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
+            case "min":
                 op = sp.Min
-            case "max=":
+                init_val = backend.functions.NumericLimitsFunctions("min")
+            case "max":
                 op = sp.Max
+                init_val = backend.functions.NumericLimitsFunctions("max")
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment?
+        lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val))
+
+        return PsAssignment(lhs, op(lhs.clone(), rhs))
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index d377fb3d3..5578d2408 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -3,6 +3,8 @@ from dataclasses import dataclass
 
 from ..field import Field
 
+from backend.ast.expressions import PsExpression
+
 
 @dataclass(frozen=True)
 class PsSymbolProperty:
@@ -14,6 +16,14 @@ class UniqueSymbolProperty(PsSymbolProperty):
     """Base class for unique properties, of which only one instance may be registered at a time."""
 
 
+@dataclass(frozen=True)
+class ReductionSymbolProperty(UniqueSymbolProperty):
+    """Symbol acts as a base pointer to a field."""
+
+    op: str
+    init_val: PsExpression
+
+
 @dataclass(frozen=True)
 class FieldShape(PsSymbolProperty):
     """Symbol acts as a shape parameter to a field."""
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 90ab61ede..e2760cc6c 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -12,13 +12,11 @@ class ReducedAssignment(AssignmentBase):
        Symbol for binary operation being applied in the assignment, such as "+",
        "*", etc.
     """
-    binop = None  # type: str
-
-    # TODO: initial value
+    binop = None # type: str
 
     @property
     def op(self):
-        return self.binop + '='
+        return self.binop
 
 
 class AddReducedAssignment(ReducedAssignment):
-- 
GitLab


From 66ce43954c585e5d576f895ab0d60f62196db813 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:19:09 +0100
Subject: [PATCH 007/180] Minor import fixes

---
 src/pystencils/backend/kernelcreation/freeze.py | 9 ++++-----
 src/pystencils/codegen/properties.py            | 2 +-
 src/pystencils/sympyextensions/__init__.py      | 2 --
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 0d1ce72e1..7316e2f9f 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -62,11 +62,10 @@ from ..ast.vector import PsVecMemAcc
 from ..constants import PsConstant
 from ...types import PsNumericType, PsStructType, PsType
 from ..exceptions import PsInputError
-from ..functions import PsMathFunction, MathFunctions
+from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
 
-import backend.functions
-from codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import ReductionSymbolProperty
 
 
 ExprLike = (
@@ -207,10 +206,10 @@ class FreezeExpressions:
             # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
             case "min":
                 op = sp.Min
-                init_val = backend.functions.NumericLimitsFunctions("min")
+                init_val = NumericLimitsFunctions("min")
             case "max":
                 op = sp.Max
-                init_val = backend.functions.NumericLimitsFunctions("max")
+                init_val = NumericLimitsFunctions("max")
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 5578d2408..cc4ff4101 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 
 from ..field import Field
 
-from backend.ast.expressions import PsExpression
+from ..backend.ast.expressions import PsExpression
 
 
 @dataclass(frozen=True)
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 6ab24e936..7431416c9 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,7 +1,6 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
-from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -34,7 +33,6 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
-    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
-- 
GitLab


From 53fc7ca4c0ad2e601a09050b3273781ded65542a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:50:51 +0100
Subject: [PATCH 008/180] Add dictionary of reduced symbols to codegen context

---
 .../backend/kernelcreation/context.py           | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 4b4604a21..b9df6f682 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -75,7 +75,7 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        # TODO: add list of reduction symbols
+        self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -170,6 +170,21 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
+    def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty):
+        """Adds a reduction property to a symbol.
+
+        The symbol ``symbol`` should not have a reduction property and must exist in the symbol table.
+        """
+        if self.find_symbol(symbol.name) is None:
+            raise PsInternalCompilerError(
+                "add_reduction_to_symbol: Symbol does not exist in the symbol table"
+            )
+
+        if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
+            self._symbols_with_reduction[symbol] = reduction
+        else:
+            raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property")
+
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
     ) -> PsSymbol:
-- 
GitLab


From b8718cb1d67b14b39b9d806ff3131179fa97e24e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:51:22 +0100
Subject: [PATCH 009/180] Try fixing circular module import

---
 src/pystencils/backend/kernelcreation/context.py | 2 ++
 src/pystencils/codegen/properties.py             | 6 ++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index b9df6f682..686646815 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -9,6 +9,8 @@ from ...defaults import DEFAULTS
 from ...field import Field, FieldType
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
+from ...codegen.properties import ReductionSymbolProperty
+
 from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
 from ...types import (
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index cc4ff4101..2b0af986a 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,9 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
-
-from ..backend.ast.expressions import PsExpression
-
+from typing import Any
 
 @dataclass(frozen=True)
 class PsSymbolProperty:
@@ -21,7 +19,7 @@ class ReductionSymbolProperty(UniqueSymbolProperty):
     """Symbol acts as a base pointer to a field."""
 
     op: str
-    init_val: PsExpression
+    init_val: Any # TODO: type?
 
 
 @dataclass(frozen=True)
-- 
GitLab


From af855492661c5649d3286eb2153f369b3813fb88 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 12:59:16 +0100
Subject: [PATCH 010/180] Minor adaptation on how symbols are given reduction
 property

---
 src/pystencils/backend/kernelcreation/context.py | 5 +++--
 src/pystencils/backend/kernelcreation/freeze.py  | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 686646815..bcb3a53f8 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -179,13 +179,14 @@ class KernelCreationContext:
         """
         if self.find_symbol(symbol.name) is None:
             raise PsInternalCompilerError(
-                "add_reduction_to_symbol: Symbol does not exist in the symbol table"
+                f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table"
             )
 
         if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
+            symbol.add_property(reduction)
             self._symbols_with_reduction[symbol] = reduction
         else:
-            raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property")
+            raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property")
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 7316e2f9f..ae728dd49 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -213,7 +213,7 @@ class FreezeExpressions:
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val))
+        self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
 
         return PsAssignment(lhs, op(lhs.clone(), rhs))
 
-- 
GitLab


From 4ae330dc8e87ac8cbb83c6f402ab99cdcb9e9edb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 13:33:12 +0100
Subject: [PATCH 011/180] Add C function selection for numeric limits functions

---
 src/pystencils/backend/platforms/generic_cpu.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 7cb378703..ea7799a14 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -3,8 +3,8 @@ from typing import Sequence
 
 from pystencils.backend.ast.expressions import PsCall
 
-from ..functions import CFunction, PsMathFunction, MathFunctions
-from ...types import PsIntegerType, PsIeeeFloatType
+from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType
 
 from .platform import Platform
 from ..exceptions import MaterializationError
@@ -62,7 +62,10 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        # TODO: numeric limits
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
+            call.function = cfunc
+            return call
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
-- 
GitLab


From a16969bf5c054b93c5e8a1a69a291d8437bbaa35 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 15:52:05 +0100
Subject: [PATCH 012/180] Add omp reduction clauses for reduced symbols

---
 src/pystencils/backend/kernelcreation/context.py      | 5 +++++
 src/pystencils/backend/transformations/add_pragmas.py | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index bcb3a53f8..f3ee646a5 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -219,6 +219,11 @@ class KernelCreationContext:
         """Return an iterable of all symbols listed in the symbol table."""
         return self._symbols.values()
 
+    @property
+    def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]:
+        """Return a dictionary holding symbols and their reduction property."""
+        return self._symbols_with_reduction
+
     #   Fields and Arrays
 
     @property
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 78e721f38..6d72e1550 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -10,6 +10,8 @@ from ..ast import PsAstNode
 from ..ast.structural import PsBlock, PsLoop, PsPragma
 from ..ast.expressions import PsExpression
 
+from ...types import PsScalarType
+
 if TYPE_CHECKING:
     from ...codegen.config import OpenMpConfig
 
@@ -110,6 +112,13 @@ class AddOpenMP:
         pragma_text += " parallel" if not omp_params.omit_parallel_construct else ""
         pragma_text += f" for schedule({omp_params.schedule})"
 
+        if bool(ctx.symbols_with_reduction):
+            for symbol, reduction in ctx.symbols_with_reduction.items():
+                if isinstance(symbol.dtype, PsScalarType):
+                    pragma_text += f" reduction({reduction.op}: {symbol.name})"
+                else:
+                    NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.")
+
         if omp_params.num_threads is not None:
             pragma_text += f" num_threads({str(omp_params.num_threads)})"
 
-- 
GitLab


From 555a6a836408071ca32c705bdf0fdf5e5e610437 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:15:24 +0100
Subject: [PATCH 013/180] Reformat reduction.py

---
 src/pystencils/sympyextensions/reduction.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index e2760cc6c..c9e5bfdfb 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -12,7 +12,7 @@ class ReducedAssignment(AssignmentBase):
        Symbol for binary operation being applied in the assignment, such as "+",
        "*", etc.
     """
-    binop = None # type: str
+    binop = None  # type: str
 
     @property
     def op(self):
@@ -22,6 +22,7 @@ class ReducedAssignment(AssignmentBase):
 class AddReducedAssignment(ReducedAssignment):
     binop = '+'
 
+
 class SubReducedAssignment(ReducedAssignment):
     binop = '-'
 
@@ -33,6 +34,7 @@ class MulReducedAssignment(ReducedAssignment):
 class MinReducedssignment(ReducedAssignment):
     binop = 'min'
 
+
 class MaxReducedssignment(ReducedAssignment):
     binop = 'max'
 
@@ -45,7 +47,8 @@ reduced_assign_classes = {
     ]
 }
 
+
 def reduced_assign(lhs, op, rhs):
     if op not in reduced_assign_classes:
         raise ValueError("Unrecognized operator %s" % op)
-    return reduced_assign_classes[op](lhs, rhs)
\ No newline at end of file
+    return reduced_assign_classes[op](lhs, rhs)
-- 
GitLab


From ef9239ede7d00457d90aab5a1740894dfc47d21a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:18:37 +0100
Subject: [PATCH 014/180] Add back reduced_assign to sympyextensions interface

---
 src/pystencils/sympyextensions/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 7431416c9..6ab24e936 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
+from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -33,6 +34,7 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
+    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
-- 
GitLab


From cf2ec0662b8ea423c252c71e4e926e7fc388d4da Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:19:23 +0100
Subject: [PATCH 015/180] Fix inheritance of special math function enum classes

---
 src/pystencils/backend/functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index ea0d6cb9d..736345395 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -94,7 +94,7 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
-class NumericLimitsFunctions(MathFunctions):
+class NumericLimitsFunctions(Enum):
     """Numerical limits functions supported by the backend.
 
     Each platform has to materialize these functions to a concrete implementation.
@@ -109,12 +109,12 @@ class PsMathFunction(PsFunction):
 
     __match_args__ = ("func",)
 
-    def __init__(self, func: MathFunctions) -> None:
+    def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None:
         super().__init__(func.function_name, func.num_args)
         self._func = func
 
     @property
-    def func(self) -> MathFunctions:
+    def func(self) -> MathFunctions | NumericLimitsFunctions:
         return self._func
 
     def __str__(self) -> str:
-- 
GitLab


From 9741c024245137405c2c7b09db63267d88d6c12b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:20:14 +0100
Subject: [PATCH 016/180] Fix header include of limits.h

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index ea7799a14..e1a34564d 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -43,7 +43,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>", "<climits.h"}
+        return {"<math.h>", "<limits.h>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-- 
GitLab


From 9a8e6f9bb9a14a144f80b678147c1a4c36456741 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 17:30:25 +0100
Subject: [PATCH 017/180] Omit distinction between normal and reduced
 assignments in AssignmentCollection

---
 src/pystencils/simp/assignment_collection.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index 212dbf751..03b4edccf 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -9,8 +9,6 @@ import pystencils
 from ..assignment import Assignment
 from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs)
 from ..sympyextensions import count_operations, fast_subs
-from ..sympyextensions import reduced_assign
-from ..sympyextensions.reduction import ReducedAssignment
 
 
 class AssignmentCollection:
@@ -58,13 +56,8 @@ class AssignmentCollection:
         subexpressions = list(itertools.chain.from_iterable(
             [(a if isinstance(a, Iterable) else [a]) for a in subexpressions]))
 
-        # filter out reduced assignments
-        reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)]
-        main_assignments = [a for a in main_assignments if (a not in reduced_assignments)]
-
         self.main_assignments = main_assignments
         self.subexpressions = subexpressions
-        self.reductions = reduced_assignments
 
         if simplification_hints is None:
             simplification_hints = {}
@@ -79,11 +72,6 @@ class AssignmentCollection:
         else:
             self.subexpression_symbol_generator = subexpression_symbol_generator
 
-    def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None:
-        """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
-        assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists"
-        self.reductions.append(reduced_assign(lhs, op, rhs))
-
     def add_simplification_hint(self, key: str, value: Any) -> None:
         """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
         assert key not in self.simplification_hints, "This hint already exists"
-- 
GitLab


From e9ee769d2b8d2f177611d6ca5c2ccc0394d83874 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 17:38:46 +0100
Subject: [PATCH 018/180] Adaptations to reduction test

---
 tests/kernelcreation/test_reduction.py | 40 ++++++++------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index f8c2b1870..0532b30f5 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -6,39 +6,23 @@ import pystencils as ps
 from pystencils import AddReducedAssignment
 
 
-@pytest.mark.parametrize('dtype', ["float64", "float32"])
-def test_log(dtype):
-    a = sp.Symbol("a")
+@pytest.mark.parametrize('dtype', ["float64"])
+def test_reduction(dtype):
     x = ps.fields(f'x: {dtype}[1d]')
+    w = sp.Symbol("w")
 
-    # kernel with main assignments and no reduction
+    # kernel with reduction assignment
 
-    main_assignment = ps.AssignmentCollection({x.center(): a})
+    reduction_assignment = AddReducedAssignment(w, x.center())
 
-    ast_main = ps.create_kernel(main_assignment, default_dtype=dtype)
-    code_main = ps.get_code_str(ast_main)
-    kernel_main = ast_main.compile()
+    config = ps.CreateKernelConfig(cpu_openmp=True)
 
-    # ps.show_code(ast)
-
-    if dtype == "float64":
-        assert "float" not in code_main
-
-    array = np.zeros((10,), dtype=dtype)
-    kernel_main(x=array, a=100)
-    assert np.allclose(array, 4.60517019)
-
-    # kernel with single reduction assignment
-
-    omega = sp.Symbol("omega")
-
-    reduction_assignment = AddReducedAssignment(omega, x.center())
-
-    ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
-    code_reduction = ps.get_code_str(ast_reduction)
+    ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype)
+    #code_reduction = ps.get_code_str(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
-    if dtype == "float64":
-        assert "float" not in code_reduction
+    ps.show_code(ast_reduction)
 
-    ps.show_code(ast_reduction)
\ No newline at end of file
+    array = np.ones((10,), dtype=dtype)
+    kernel_reduction(x=array, w=0)
+    # TODO: check if "w = #points"
\ No newline at end of file
-- 
GitLab


From f16d8e7978174c24f682202b65aa64e1d53003bb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 18:15:58 +0100
Subject: [PATCH 019/180] Rename min/max of numeric limits enum

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index e1a34564d..27df6aee4 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -62,7 +62,7 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max):
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
-- 
GitLab


From 1a1c23b57015ace0c821d0a709d2bf17fa67b42a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 18:38:34 +0100
Subject: [PATCH 020/180] Adapt comment of ReductionSymbolProperty

---
 src/pystencils/codegen/properties.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 2b0af986a..0bad4e898 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from ..field import Field
 from typing import Any
 
+
 @dataclass(frozen=True)
 class PsSymbolProperty:
     """Base class for symbol properties, which can be used to add additional information to symbols"""
@@ -16,10 +17,10 @@ class UniqueSymbolProperty(PsSymbolProperty):
 
 @dataclass(frozen=True)
 class ReductionSymbolProperty(UniqueSymbolProperty):
-    """Symbol acts as a base pointer to a field."""
+    """Property for symbols specifying the operation and initial value for a reduction."""
 
     op: str
-    init_val: Any # TODO: type?
+    init_val: Any  # TODO: type?
 
 
 @dataclass(frozen=True)
-- 
GitLab


From fff5a079171ad884069cd307f2777a2dc0aa68f6 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 13:43:32 +0100
Subject: [PATCH 021/180] Fix removal of function parameters for lhs symbols
 that are not declared in the kernel

---
 src/pystencils/backend/ast/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py
index edeba04f2..7032690a0 100644
--- a/src/pystencils/backend/ast/analysis.py
+++ b/src/pystencils/backend/ast/analysis.py
@@ -62,7 +62,7 @@ class UndefinedSymbolsCollector:
 
             case PsAssignment(lhs, rhs):
                 undefined_vars = self(lhs) | self(rhs)
-                if isinstance(lhs, PsSymbolExpr):
+                if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr):
                     undefined_vars.remove(lhs.symbol)
                 return undefined_vars
 
-- 
GitLab


From bb984679607d2f3625849667f06c348245b1813a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:29:10 +0100
Subject: [PATCH 022/180] Fix usage of numerical limits for init value of
 reduction

---
 src/pystencils/backend/functions.py             | 8 ++++++--
 src/pystencils/backend/kernelcreation/freeze.py | 4 ++--
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 736345395..18c2277cf 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -100,8 +100,12 @@ class NumericLimitsFunctions(Enum):
     Each platform has to materialize these functions to a concrete implementation.
     """
 
-    min = ("min", 0)
-    max = ("max", 0)
+    Min = ("min", 0)
+    Max = ("max", 0)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
 
 
 class PsMathFunction(PsFunction):
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index ae728dd49..9a34303e2 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -206,10 +206,10 @@ class FreezeExpressions:
             # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
             case "min":
                 op = sp.Min
-                init_val = NumericLimitsFunctions("min")
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
             case "max":
                 op = sp.Max
-                init_val = NumericLimitsFunctions("max")
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 27df6aee4..e1a34564d 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -62,7 +62,7 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max):
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
-- 
GitLab


From a3025645e30265621d41315e7a0449768225c361 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:55:42 +0100
Subject: [PATCH 023/180] Fix min/max reductions

---
 src/pystencils/backend/kernelcreation/freeze.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 9a34303e2..64230203f 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -193,29 +193,31 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
+        new_rhs: PsExpression
         match expr.op:
             case "+":
-                op = add
                 init_val = PsConstant(0)
+                new_rhs = add(lhs.clone(), rhs)
             case "-":
-                op = sub
                 init_val = PsConstant(0)
+                new_rhs = sub(lhs.clone(), rhs)
             case "*":
-                op = mul
                 init_val = PsConstant(1)
-            # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
+                new_rhs = mul(lhs.clone(), rhs)
             case "min":
-                op = sp.Min
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs])
             case "max":
-                op = sp.Max
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
+        # set reduction symbol property in context
         self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
 
-        return PsAssignment(lhs, op(lhs.clone(), rhs))
+        return PsAssignment(lhs, new_rhs)
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
-- 
GitLab


From 9bbb8181dcd0717dd61de853f33e11c1ca19d806 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:56:12 +0100
Subject: [PATCH 024/180] Parameterize test_reduction.py for different
 reduction operations

---
 tests/kernelcreation/test_reduction.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 0532b30f5..c41d250f4 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -3,17 +3,18 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from pystencils import AddReducedAssignment
+from pystencils.sympyextensions import reduced_assign
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-def test_reduction(dtype):
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+def test_reduction(dtype, op):
     x = ps.fields(f'x: {dtype}[1d]')
     w = sp.Symbol("w")
 
     # kernel with reduction assignment
 
-    reduction_assignment = AddReducedAssignment(w, x.center())
+    reduction_assignment = reduced_assign(w, op, x.center())
 
     config = ps.CreateKernelConfig(cpu_openmp=True)
 
-- 
GitLab


From 3c5a93b4a0016a2c12e0f210fd1324816e6df15e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 15:02:19 +0100
Subject: [PATCH 025/180] Define type of init_val for reduction as Any

---
 src/pystencils/backend/kernelcreation/freeze.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 64230203f..840329013 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -195,6 +195,7 @@ class FreezeExpressions:
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
+        init_val: Any  # TODO: type?
         match expr.op:
             case "+":
                 init_val = PsConstant(0)
-- 
GitLab


From 75ea862f50d5372126d1a934b4b1e15ba3dc8c85 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 15:06:57 +0100
Subject: [PATCH 026/180] Try to fix mypy no-redef error

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index e1a34564d..3deb03329 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -63,12 +63,12 @@ class GenericCpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            cfunc: CFunction
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
-            cfunc: CFunction
             match func:
                 case (
                     MathFunctions.Exp
-- 
GitLab


From 90ca9ead0199cd4f5988e6c43e9c9c5350f566b6 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 20 Jan 2025 17:46:49 +0100
Subject: [PATCH 027/180] Try initializing kernel-local reduction variable copy

---
 .../backend/kernelcreation/freeze.py          | 28 +++++++++++--------
 src/pystencils/codegen/driver.py              | 12 +++++++-
 src/pystencils/codegen/properties.py          |  7 +++--
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 840329013..e0dcba8fd 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -7,6 +7,7 @@ import sympy.core.relational
 import sympy.logic.boolalg
 from sympy.codegen.ast import AssignmentBase, AugmentedAssignment
 
+from ..memory import PsSymbol
 from ...assignment import Assignment
 from ...simp import AssignmentCollection
 from ...sympyextensions import (
@@ -193,32 +194,37 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        # create kernel-local copy of lhs symbol to work with
+        new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype)
+        new_lhs = PsSymbolExpr(new_lhs_symbol)
+        self._ctx.add_symbol(new_lhs_symbol)
+
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
-        init_val: Any  # TODO: type?
+        init_val: PsExpression
         match expr.op:
             case "+":
-                init_val = PsConstant(0)
-                new_rhs = add(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(0))
+                new_rhs = add(new_lhs.clone(), rhs)
             case "-":
-                init_val = PsConstant(0)
-                new_rhs = sub(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(0))
+                new_rhs = sub(new_lhs.clone(), rhs)
             case "*":
-                init_val = PsConstant(1)
-                new_rhs = mul(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(1))
+                new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
         # set reduction symbol property in context
-        self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
+        self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol))
 
-        return PsAssignment(lhs, new_rhs)
+        return PsAssignment(new_lhs, new_rhs)
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 7bdec96cc..199860743 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,12 +7,13 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
+from ..backend.ast.expressions import PsSymbolExpr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop
+from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -151,6 +152,14 @@ class DefaultKernelCreationDriver:
         if self._intermediates is not None:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
+        #   Init local reduction variable copy
+        # for red, prop in self._ctx.symbols_with_reduction.items():
+        #     kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+
+        #   Write back result to reduction target variable
+        # for red, prop in self._ctx.symbols_with_reduction.items():
+        #     kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
             kernel_ast = self._transform_for_cpu(kernel_ast)
@@ -449,6 +458,7 @@ def _get_function_params(
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
+                # TODO: how to export reduction result (via pointer)?
                 case FieldShape() | FieldStride():
                     props.add(prop)
                 case BufferBasePtr(buf):
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 0bad4e898..4b8e7f2bf 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
-from typing import Any
 
 
 @dataclass(frozen=True)
@@ -19,8 +18,12 @@ class UniqueSymbolProperty(PsSymbolProperty):
 class ReductionSymbolProperty(UniqueSymbolProperty):
     """Property for symbols specifying the operation and initial value for a reduction."""
 
+    from ..backend.memory import PsSymbol
+    from ..backend.ast.expressions import PsExpression
+
     op: str
-    init_val: Any  # TODO: type?
+    init_val: PsExpression
+    orig_symbol: PsSymbol
 
 
 @dataclass(frozen=True)
-- 
GitLab


From 3fc9a049683b0cbac6bfa721efd38db05c201236 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 13:55:35 +0100
Subject: [PATCH 028/180] Swap out neutral init values for reduced assignments
 with min/max op

---
 src/pystencils/backend/kernelcreation/freeze.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 840329013..b58813fcd 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -207,10 +207,10 @@ class FreezeExpressions:
                 init_val = PsConstant(1)
                 new_rhs = mul(lhs.clone(), rhs)
             case "min":
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs])
             case "max":
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
-- 
GitLab


From 9fd1c2ad9bb4f8c225627306f0369273092ef737 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 17:34:06 +0100
Subject: [PATCH 029/180] Fix declaration of local reduction var and write back
 to original variable

---
 src/pystencils/codegen/driver.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 199860743..4b08b84ef 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -13,7 +13,7 @@ from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment
+from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -153,12 +153,16 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Init local reduction variable copy
-        # for red, prop in self._ctx.symbols_with_reduction.items():
-        #     kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+        for red, prop in self._ctx.symbols_with_reduction.items():
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
 
         #   Write back result to reduction target variable
-        # for red, prop in self._ctx.symbols_with_reduction.items():
-        #     kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+        for red, prop in self._ctx.symbols_with_reduction.items():
+            kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+
+        # TODO: can this be omitted?
+        typify = Typifier(self._ctx)
+        kernel_ast = typify(kernel_ast)
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
-- 
GitLab


From 6bc3cf3f17ed3395dabc15a2207739de245cd038 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 18:42:43 +0100
Subject: [PATCH 030/180] Set type of reduced variable to pointer and write
 back via PsMemAcc

---
 src/pystencils/backend/kernelcreation/freeze.py | 15 ++++++++++-----
 src/pystencils/codegen/driver.py                | 10 ++++------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 06d98a44e..d8fb1b91e 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -61,7 +61,7 @@ from ..ast.expressions import (
 from ..ast.vector import PsVecMemAcc
 
 from ..constants import PsConstant
-from ...types import PsNumericType, PsStructType, PsType
+from ...types import PsNumericType, PsStructType, PsType, PsPointerType
 from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
@@ -195,9 +195,9 @@ class FreezeExpressions:
         assert isinstance(lhs, PsSymbolExpr)
 
         # create kernel-local copy of lhs symbol to work with
-        new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype)
-        new_lhs = PsSymbolExpr(new_lhs_symbol)
-        self._ctx.add_symbol(new_lhs_symbol)
+        new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype)
+        new_lhs = PsSymbolExpr(new_lhs_symb)
+        self._ctx.add_symbol(new_lhs_symb)
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
@@ -221,8 +221,13 @@ class FreezeExpressions:
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
+        # replace original symbol with pointer-based type used for export
+        orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype))
+        self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr)
+
         # set reduction symbol property in context
-        self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol))
+        init_val.dtype = rhs.dtype
+        self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr))
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 4b08b84ef..04d7376d0 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
-from ..backend.ast.expressions import PsSymbolExpr
+from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
@@ -158,11 +158,9 @@ class DefaultKernelCreationDriver:
 
         #   Write back result to reduction target variable
         for red, prop in self._ctx.symbols_with_reduction.items():
-            kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
-
-        # TODO: can this be omitted?
-        typify = Typifier(self._ctx)
-        kernel_ast = typify(kernel_ast)
+            kernel_ast.statements += [PsAssignment(
+                PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
+                PsSymbolExpr(red))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
-- 
GitLab


From b5dd2ef085b19e505d0331d8aa8f6dae9ee85eb0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 15:16:32 +0100
Subject: [PATCH 031/180] Split reduction var property into local and
 pointer-based reduction var properties

---
 .../backend/kernelcreation/context.py         | 54 ++++++++++++++-----
 .../backend/kernelcreation/freeze.py          | 31 ++++++-----
 .../backend/transformations/add_pragmas.py    |  4 +-
 src/pystencils/codegen/driver.py              | 10 ++--
 src/pystencils/codegen/properties.py          | 16 ++++--
 5 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index f3ee646a5..5e5ca117d 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -9,7 +9,7 @@ from ...defaults import DEFAULTS
 from ...field import Field, FieldType
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
-from ...codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
 
 from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
@@ -77,7 +77,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict()
+        self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict()
+        self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -172,21 +173,41 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
-    def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty):
-        """Adds a reduction property to a symbol.
+    def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable):
+        """Adds entry for a symbol and its property to the lookup table for local reduction variables.
 
-        The symbol ``symbol`` should not have a reduction property and must exist in the symbol table.
+        The symbol ``local_symb`` should not have a ``LocalReductionVariable`` property and must not exist in the symbol table.
         """
-        if self.find_symbol(symbol.name) is None:
+        if self.find_symbol(local_symb.name) is not None:
             raise PsInternalCompilerError(
-                f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table"
+                f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table"
             )
+        self.add_symbol(local_symb)
 
-        if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
-            symbol.add_property(reduction)
-            self._symbols_with_reduction[symbol] = reduction
+        if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable):
+            local_symb.add_property(local_var_prop)
+            self._local_reduction_symbols[local_symb] = local_var_prop
         else:
-            raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property")
+            raise PsInternalCompilerError(
+                f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table"
+            )
+
+    def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable):
+        """Replaces reduction symbol with a pointer-based counterpart used for export
+        and adds the new symbol and its property to the lookup table for pointer-based reduction variables
+
+        The symbol ``ptr_symb`` should not exist in the symbol table.
+        """
+        self.replace_symbol(orig_symb, ptr_symb)
+
+        if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties(
+                ReductionPointerVariable):
+            ptr_symb.add_property(ptr_var_prop)
+            self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop
+        else:
+            raise PsInternalCompilerError(
+                f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table "
+            )
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
@@ -220,9 +241,14 @@ class KernelCreationContext:
         return self._symbols.values()
 
     @property
-    def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]:
-        """Return a dictionary holding symbols and their reduction property."""
-        return self._symbols_with_reduction
+    def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]:
+        """Return a dictionary holding kernel-local reduction symbols and their reduction properties."""
+        return self._local_reduction_symbols
+
+    @property
+    def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]:
+        """Return a dictionary holding pointer-based reduction symbols and their reduction properties."""
+        return self._reduction_ptr_symbols
 
     #   Fields and Arrays
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index d8fb1b91e..1e9984def 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -66,7 +66,7 @@ from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
 
-from ...codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
 
 
 ExprLike = (
@@ -194,40 +194,45 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        orig_lhs_symb = lhs.symbol
+        dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts?
+
+        # replace original symbol with pointer-based type used for export
+        orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
+
         # create kernel-local copy of lhs symbol to work with
-        new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype)
+        new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype)
         new_lhs = PsSymbolExpr(new_lhs_symb)
-        self._ctx.add_symbol(new_lhs_symb)
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
         init_val: PsExpression
         match expr.op:
             case "+":
-                init_val = PsConstantExpr(PsConstant(0))
+                init_val = PsConstantExpr(PsConstant(0, dtype))
                 new_rhs = add(new_lhs.clone(), rhs)
             case "-":
-                init_val = PsConstantExpr(PsConstant(0))
+                init_val = PsConstantExpr(PsConstant(0, dtype))
                 new_rhs = sub(new_lhs.clone(), rhs)
             case "*":
-                init_val = PsConstantExpr(PsConstant(1))
+                init_val = PsConstantExpr(PsConstant(1, dtype))
                 new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                init_val.dtype = dtype
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                init_val.dtype = dtype
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        # replace original symbol with pointer-based type used for export
-        orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype))
-        self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr)
-
-        # set reduction symbol property in context
-        init_val.dtype = rhs.dtype
-        self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr))
+        # set reduction symbol properties (local/pointer variables) in context
+        self._ctx.add_local_reduction_symbol(new_lhs_symb,
+                                             LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr))
+        self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr,
+                                           ReductionPointerVariable(expr.op, new_lhs_symb))
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 6d72e1550..44d1d1ede 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -112,8 +112,8 @@ class AddOpenMP:
         pragma_text += " parallel" if not omp_params.omit_parallel_construct else ""
         pragma_text += f" for schedule({omp_params.schedule})"
 
-        if bool(ctx.symbols_with_reduction):
-            for symbol, reduction in ctx.symbols_with_reduction.items():
+        if bool(ctx.local_reduction_symbols):
+            for symbol, reduction in ctx.local_reduction_symbols.items():
                 if isinstance(symbol.dtype, PsScalarType):
                     pragma_text += f" reduction({reduction.op}: {symbol.name})"
                 else:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 04d7376d0..3fe2fe74e 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -153,14 +153,14 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Init local reduction variable copy
-        for red, prop in self._ctx.symbols_with_reduction.items():
-            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+        for local_red, prop in self._ctx.local_reduction_symbols.items():
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements
 
         #   Write back result to reduction target variable
-        for red, prop in self._ctx.symbols_with_reduction.items():
+        for red_ptr, prop in self._ctx.reduction_pointer_symbols.items():
             kernel_ast.statements += [PsAssignment(
-                PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
-                PsSymbolExpr(red))]
+                PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
+                PsSymbolExpr(prop.local_symbol))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 4b8e7f2bf..1e71c5b98 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -15,15 +15,25 @@ class UniqueSymbolProperty(PsSymbolProperty):
 
 
 @dataclass(frozen=True)
-class ReductionSymbolProperty(UniqueSymbolProperty):
-    """Property for symbols specifying the operation and initial value for a reduction."""
+class LocalReductionVariable(PsSymbolProperty):
+    """Property for symbols specifying the operation and initial value for a kernel-local reduction variable."""
 
     from ..backend.memory import PsSymbol
     from ..backend.ast.expressions import PsExpression
 
     op: str
     init_val: PsExpression
-    orig_symbol: PsSymbol
+    ptr_symbol: PsSymbol
+
+
+@dataclass(frozen=True)
+class ReductionPointerVariable(PsSymbolProperty):
+    """Property for pointer-type symbols exporting the reduction result from the kernel."""
+
+    from ..backend.memory import PsSymbol
+
+    op: str
+    local_symbol: PsSymbol
 
 
 @dataclass(frozen=True)
-- 
GitLab


From 350a4eac9394ca7bb95231b5ad14a1526952e23c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:01:00 +0100
Subject: [PATCH 032/180] Propagate properties of reduction pointer symbols to
 kernel parameters

---
 src/pystencils/codegen/driver.py           | 5 +++--
 src/pystencils/jit/cpu_extension_module.py | 7 +++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 3fe2fe74e..dd71e30be 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, replace
 from .target import Target
 from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
 from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 
@@ -460,7 +460,8 @@ def _get_function_params(
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
-                # TODO: how to export reduction result (via pointer)?
+                case ReductionPointerVariable():
+                    props.add(prop)
                 case FieldShape() | FieldStride():
                     props.add(prop)
                 case BufferBasePtr(buf):
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index befb033e6..c2c969eaa 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -13,7 +13,7 @@ from ..codegen import (
     Kernel,
     Parameter,
 )
-from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride
+from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable
 from ..types import (
     PsType,
     PsUnsignedIntegerType,
@@ -265,7 +265,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return self._array_buffers[field]
 
     def extract_scalar(self, param: Parameter) -> str:
-        if param not in self._scalar_extractions:
+        if any(isinstance(e, ReductionPointerVariable) for e in param.properties):
+            # TODO: implement
+            pass
+        elif param not in self._scalar_extractions:
             extract_func = self._scalar_extractor(param.dtype)
             code = self.TMPL_EXTRACT_SCALAR.format(
                 name=param.name,
-- 
GitLab


From 6c8ee44f148f13b456c0c9c754e434cc9cf5b59a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:01:51 +0100
Subject: [PATCH 033/180] Use literals for C macros used for the numeric limits

---
 .../backend/platforms/generic_cpu.py          | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 3deb03329..40c338315 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -4,6 +4,7 @@ from typing import Sequence
 from pystencils.backend.ast.expressions import PsCall
 
 from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ..literals import PsLiteral
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType
 
 from .platform import Platform
@@ -25,7 +26,7 @@ from ..ast.expressions import (
     PsLookup,
     PsGe,
     PsLe,
-    PsTernary,
+    PsTernary, PsLiteralExpr,
 )
 from ..ast.vector import PsVecMemAcc
 from ...types import PsVectorType, PsCustomType
@@ -43,7 +44,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>", "<limits.h>"}
+        return {"<math.h>", "<limits.h>", "<float.h>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -63,12 +64,25 @@ class GenericCpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
-            cfunc: CFunction
-            cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
-            call.function = cfunc
-            return call
+            # get type prefix for macro
+            # TODO: there must be a better way...
+            tpe = ""
+            match dtype:
+                case PsIeeeFloatType():
+                    match dtype.width:
+                        case 32:
+                            tpe = "FLT"
+                        case 64:
+                            tpe = "DBL"
+                case _:
+                    raise MaterializationError(
+                        f"No implementation available for function {func} on data type {dtype}"
+                    )
+
+            return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype))
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
+            cfunc: CFunction
             match func:
                 case (
                     MathFunctions.Exp
-- 
GitLab


From 4e748308aca64b02cfa689fb0356a9a38d8c35af Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:30:43 +0100
Subject: [PATCH 034/180] Integrate reduction pointers to parameters.py

---
 src/pystencils/codegen/parameters.py       | 16 ++++++++++++++--
 src/pystencils/jit/cpu_extension_module.py | 17 ++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index d8411266e..094553517 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -1,14 +1,14 @@
 from __future__ import annotations
 
 from warnings import warn
-from typing import Sequence, Iterable
+from typing import Sequence, Iterable, Optional
 
 from .properties import (
     PsSymbolProperty,
     _FieldProperty,
     FieldShape,
     FieldStride,
-    FieldBasePtr,
+    FieldBasePtr, ReductionPointerVariable,
 )
 from ..types import PsType
 from ..field import Field
@@ -39,6 +39,9 @@ class Parameter:
                 key=lambda f: f.name,
             )
         )
+        self._reduction_ptr: Optional[ReductionPointerVariable] = next(
+            (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None
+        )
 
     @property
     def name(self):
@@ -79,6 +82,11 @@ class Parameter:
         """Set of fields associated with this parameter."""
         return self._fields
 
+    @property
+    def reduction_pointer(self) -> Optional[ReductionPointerVariable]:
+        """Reduction pointer associated with this parameter."""
+        return self._reduction_ptr
+
     def get_properties(
         self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...]
     ) -> set[PsSymbolProperty]:
@@ -105,6 +113,10 @@ class Parameter:
         )
         return bool(self.get_properties(FieldBasePtr))
 
+    @property
+    def is_reduction_pointer(self) -> bool:
+        return bool(self._reduction_ptr)
+
     @property
     def is_field_stride(self) -> bool:  # pragma: no cover
         warn(
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index c2c969eaa..f9c04200c 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -206,6 +206,8 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()
 
+        self._reduction_ptrs: dict[Parameter, str] = dict()
+
         self._constraint_checks: list[str] = []
 
         self._call: str | None = None
@@ -265,10 +267,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return self._array_buffers[field]
 
     def extract_scalar(self, param: Parameter) -> str:
-        if any(isinstance(e, ReductionPointerVariable) for e in param.properties):
-            # TODO: implement
-            pass
-        elif param not in self._scalar_extractions:
+        if param not in self._scalar_extractions:
             extract_func = self._scalar_extractor(param.dtype)
             code = self.TMPL_EXTRACT_SCALAR.format(
                 name=param.name,
@@ -279,6 +278,12 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return param.name
 
+    def extract_reduction_ptr(self, param: Parameter) -> str:
+        if param not in self._reduction_ptrs:
+            # TODO: implement
+            pass
+        return param.name
+
     def extract_array_assoc_var(self, param: Parameter) -> str:
         if param not in self._array_assoc_var_extractions:
             field = param.fields[0]
@@ -306,7 +311,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return param.name
 
     def extract_parameter(self, param: Parameter):
-        if param.is_field_parameter:
+        if param.is_reduction_pointer:
+            self.extract_reduction_ptr(param)
+        elif param.is_field_parameter:
             self.extract_array_assoc_var(param)
         else:
             self.extract_scalar(param)
-- 
GitLab


From 4f6f5580bc5cb302e1a064a9d642e66822155db6 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 17:23:02 +0100
Subject: [PATCH 035/180] Rewire existing code extraction of fields to support
 reduction pointer extraction

---
 src/pystencils/jit/cpu_extension_module.py | 53 ++++++++++++----------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index f9c04200c..d8d90c924 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -199,9 +199,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 """
 
     def __init__(self) -> None:
-        self._array_buffers: dict[Field, str] = dict()
-        self._array_extractions: dict[Field, str] = dict()
-        self._array_frees: dict[Field, str] = dict()
+        self._array_buffers: dict[Any, str] = dict()
+        self._array_extractions: dict[Any, str] = dict()
+        self._array_frees: dict[Any, str] = dict()
 
         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()
@@ -235,36 +235,37 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         else:
             return None
 
-    def extract_field(self, field: Field) -> str:
+    def extract_buffer(self, buffer: Any, name: str, dtype: PsType) -> str:
         """Adds an array, and returns the name of the underlying Py_Buffer."""
-        if field not in self._array_extractions:
-            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name)
+        if buffer not in self._array_extractions:
+            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name)
 
             #   Check array type
-            type_char = self._type_char(field.dtype)
+            type_char = self._type_char(dtype)
             if type_char is not None:
-                dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'"
+                dtype_cond = f"buffer_{name}.format[0] == '{type_char}'"
                 extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
                     cond=dtype_cond,
                     what="data type",
-                    name=field.name,
-                    expected=str(field.dtype),
+                    name=name,
+                    expected=str(dtype),
                 )
 
             #   Check item size
-            itemsize = field.dtype.itemsize
-            item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}"
-            extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
-                cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize
-            )
+            itemsize = dtype.itemsize
+            if itemsize is not None:  # itemsize of pointer not known (TODO?)
+                item_size_cond = f"buffer_{name}.itemsize == {itemsize}"
+                extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
+                    cond=item_size_cond, what="itemsize", name=name, expected=itemsize
+                )
 
-            self._array_buffers[field] = f"buffer_{field.name}"
-            self._array_extractions[field] = extraction_code
+            self._array_buffers[buffer] = f"buffer_{name}"
+            self._array_extractions[buffer] = extraction_code
 
-            release_code = f"PyBuffer_Release(&buffer_{field.name});"
-            self._array_frees[field] = release_code
+            release_code = f"PyBuffer_Release(&buffer_{name});"
+            self._array_frees[buffer] = release_code
 
-        return self._array_buffers[field]
+        return self._array_buffers[buffer]
 
     def extract_scalar(self, param: Parameter) -> str:
         if param not in self._scalar_extractions:
@@ -280,14 +281,20 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
     def extract_reduction_ptr(self, param: Parameter) -> str:
         if param not in self._reduction_ptrs:
-            # TODO: implement
-            pass
+            ptr = param.reduction_pointer
+            buffer = self.extract_buffer(ptr, param.name, param.dtype)
+            code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
+
+            assert code is not None
+
+            self._array_assoc_var_extractions[param] = code
+
         return param.name
 
     def extract_array_assoc_var(self, param: Parameter) -> str:
         if param not in self._array_assoc_var_extractions:
             field = param.fields[0]
-            buffer = self.extract_field(field)
+            buffer = self.extract_buffer(field, field.name, field.dtype)
             code: str | None = None
 
             for prop in param.properties:
-- 
GitLab


From 72fa86729d84028e8b900e8a9cd0f9a8cdfab401 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 17:35:50 +0100
Subject: [PATCH 036/180] Refine test_reduction.py to check for result
 correctness

---
 tests/kernelcreation/test_reduction.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index c41d250f4..b97343e72 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -5,6 +5,15 @@ import sympy as sp
 import pystencils as ps
 from pystencils.sympyextensions import reduced_assign
 
+INIT=2
+SIZE=15
+SOLUTION = {
+    "+": INIT * SIZE,
+    "-": INIT * -SIZE,
+    "*": INIT**SIZE,
+    "min": INIT,
+    "max": INIT
+}
 
 @pytest.mark.parametrize('dtype', ["float64"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
@@ -24,6 +33,7 @@ def test_reduction(dtype, op):
 
     ps.show_code(ast_reduction)
 
-    array = np.ones((10,), dtype=dtype)
-    kernel_reduction(x=array, w=0)
-    # TODO: check if "w = #points"
\ No newline at end of file
+    array = np.full((SIZE,), INIT, dtype=dtype)
+    reduction_array = np.zeros(1, dtype=dtype)
+    kernel_reduction(x=array, w=reduction_array)
+    assert np.allclose(reduction_array, SOLUTION[op])
\ No newline at end of file
-- 
GitLab


From 6b8bff09ef5b1c3f451aacae6aa730a9a822f5b2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 14 Jan 2025 18:13:18 +0100
Subject: [PATCH 037/180] Initial work for introducing reduction capabilities
 to pystencils

Signed-off-by: zy69guqi <richard.angersbach@fau.de>
---
 .../backend/kernelcreation/freeze.py          | 27 +++++++++
 src/pystencils/simp/assignment_collection.py  | 13 +++++
 src/pystencils/sympyextensions/__init__.py    |  2 +
 src/pystencils/sympyextensions/reduction.py   | 57 +++++++++++++++++++
 tests/kernelcreation/test_reduction.py        | 44 ++++++++++++++
 5 files changed, 143 insertions(+)
 create mode 100644 src/pystencils/sympyextensions/reduction.py
 create mode 100644 tests/kernelcreation/test_reduction.py

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 44ee17077..65be23065 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -1,3 +1,4 @@
+from sympyextensions.reduction import ReducedAssignment
 from typing import overload, cast, Any
 from functools import reduce
 from operator import add, mul, sub, truediv
@@ -183,6 +184,32 @@ class FreezeExpressions:
 
         return PsAssignment(lhs, op(lhs.clone(), rhs))
 
+    def map_ReducedAssignment(self, expr: ReducedAssignment):
+        lhs = self.visit(expr.lhs)
+        rhs = self.visit(expr.rhs)
+
+        assert isinstance(lhs, PsExpression)
+        assert isinstance(rhs, PsExpression)
+
+        match expr.op:
+            case "+=":
+                op = add
+            case "-=":
+                op = sub
+            case "*=":
+                op = mul
+            case "/=":
+                op = truediv
+            # TODO: unsure if sp.Min & sp.Max work here
+            case "min=":
+                op = sp.Min
+            case "max=":
+                op = sp.Max
+            case _:
+                raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
+
+        return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment?
+
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
         return PsSymbolExpr(symb)
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index f1ba87154..4de3e8dc6 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,5 +1,8 @@
 import itertools
 from copy import copy
+
+from sympyextensions import reduced_assign
+from sympyextensions.reduction import ReducedAssignment
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union
 
 import sympy as sp
@@ -55,8 +58,13 @@ class AssignmentCollection:
         subexpressions = list(itertools.chain.from_iterable(
             [(a if isinstance(a, Iterable) else [a]) for a in subexpressions]))
 
+        # filter out reduced assignments
+        reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)]
+        main_assignments = [a for a in main_assignments if (a not in reduced_assignments)]
+
         self.main_assignments = main_assignments
         self.subexpressions = subexpressions
+        self.reductions = reduced_assignments
 
         if simplification_hints is None:
             simplification_hints = {}
@@ -71,6 +79,11 @@ class AssignmentCollection:
         else:
             self.subexpression_symbol_generator = subexpression_symbol_generator
 
+    def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None:
+        """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
+        assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists"
+        self.reductions.append(reduced_assign(lhs, op, rhs))
+
     def add_simplification_hint(self, key: str, value: Any) -> None:
         """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
         assert key not in self.simplification_hints, "This hint already exists"
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 7431416c9..6ab24e936 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
+from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -33,6 +34,7 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
+    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
new file mode 100644
index 000000000..aa947c1d2
--- /dev/null
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -0,0 +1,57 @@
+from sympy.codegen.ast import AssignmentBase
+
+
+class ReducedAssignment(AssignmentBase):
+    """
+    Base class for reduced assignments.
+
+    Attributes:
+    ===========
+
+    binop : str
+       Symbol for binary operation being applied in the assignment, such as "+",
+       "*", etc.
+    """
+    binop = None  # type: str
+
+    # TODO: initial value
+
+    @property
+    def op(self):
+        return self.binop + '='
+
+
+class AddReducedAssignment(ReducedAssignment):
+    binop = '+'
+
+class SubReducedAssignment(ReducedAssignment):
+    binop = '-'
+
+
+class MulReducedAssignment(ReducedAssignment):
+    binop = '*'
+
+
+class DivReducedAssignment(ReducedAssignment):
+    binop = '/'
+
+
+class MinReducedssignment(ReducedAssignment):
+    binop = 'min'
+
+class MaxReducedssignment(ReducedAssignment):
+    binop = 'max'
+
+
+# Mapping from binary op strings to AugmentedAssignment subclasses
+reduced_assign_classes = {
+    cls.binop: cls for cls in [
+        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment,
+        MinReducedssignment, MaxReducedssignment
+    ]
+}
+
+def reduced_assign(lhs, op, rhs):
+    if op not in reduced_assign_classes:
+        raise ValueError("Unrecognized operator %s" % op)
+    return reduced_assign_classes[op](lhs, rhs)
\ No newline at end of file
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
new file mode 100644
index 000000000..47509e267
--- /dev/null
+++ b/tests/kernelcreation/test_reduction.py
@@ -0,0 +1,44 @@
+import pytest
+import numpy as np
+import sympy as sp
+
+import pystencils as ps
+from sympyextensions.reduction import reduced_assign
+
+
+@pytest.mark.parametrize('dtype', ["float64", "float32"])
+def test_log(dtype):
+    a = sp.Symbol("a")
+    x = ps.fields(f'x: {dtype}[1d]')
+
+    # kernel with main assignments and no reduction
+
+    main_assignment = ps.AssignmentCollection({x.center(): a})
+
+    ast_main = ps.create_kernel(main_assignment, default_dtype=dtype)
+    code_main = ps.get_code_str(ast_main)
+    kernel_main = ast_main.compile()
+
+    # ps.show_code(ast)
+
+    if dtype == "float64":
+        assert "float" not in code_main
+
+    array = np.zeros((10,), dtype=dtype)
+    kernel_main(x=array, a=100)
+    assert np.allclose(array, 4.60517019)
+
+    # kernel with single reduction assignment
+
+    omega = sp.Symbol("omega")
+
+    reduction_assignment = reduced_assign(omega, "+", x.center())
+
+    ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
+    code_reduction = ps.get_code_str(ast_reduction)
+    kernel_reduction = ast_reduction.compile()
+
+    if dtype == "float64":
+        assert "float" not in code_reduction
+
+    ps.show_code(ast_reduction)
\ No newline at end of file
-- 
GitLab


From f54fa321869682e71edd3f2828725104de9abe36 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 11:36:53 +0100
Subject: [PATCH 038/180] Fix relative module imports for newly introduced
 sympyextensions for reductions

---
 src/pystencils/backend/kernelcreation/freeze.py | 2 +-
 src/pystencils/simp/assignment_collection.py    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 65be23065..4d75f1ca6 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -1,4 +1,3 @@
-from sympyextensions.reduction import ReducedAssignment
 from typing import overload, cast, Any
 from functools import reduce
 from operator import add, mul, sub, truediv
@@ -16,6 +15,7 @@ from ...sympyextensions import (
 )
 from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
+from ...sympyextensions.reduction import ReducedAssignment
 from ...field import Field, FieldType
 
 from .context import KernelCreationContext
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index 4de3e8dc6..212dbf751 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,8 +1,6 @@
 import itertools
 from copy import copy
 
-from sympyextensions import reduced_assign
-from sympyextensions.reduction import ReducedAssignment
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union
 
 import sympy as sp
@@ -11,6 +9,8 @@ import pystencils
 from ..assignment import Assignment
 from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs)
 from ..sympyextensions import count_operations, fast_subs
+from ..sympyextensions import reduced_assign
+from ..sympyextensions.reduction import ReducedAssignment
 
 
 class AssignmentCollection:
-- 
GitLab


From c7b9bb522828f8d5e32d3c20ad7d17eee689d2eb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 12:59:36 +0100
Subject: [PATCH 039/180] Expose new reduced assignments to pystencils
 interface

---
 src/pystencils/__init__.py             | 14 ++++++++++++++
 tests/kernelcreation/test_reduction.py |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 6cb375b61..eecd929cf 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -38,6 +38,14 @@ from .simp import AssignmentCollection
 from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator
 from .datahandling import create_data_handling
+from .sympyextensions.reduction import (
+    AddReducedAssignment,
+    SubReducedAssignment,
+    MulReducedAssignment,
+    DivReducedAssignment,
+    MinReducedssignment,
+    MaxReducedssignment
+)
 
 __all__ = [
     "Field",
@@ -69,6 +77,12 @@ __all__ = [
     "AssignmentCollection",
     "Assignment",
     "AddAugmentedAssignment",
+    "AddReducedAssignment",
+    "SubReducedAssignment",
+    "MulReducedAssignment",
+    "DivReducedAssignment",
+    "MinReducedssignment",
+    "MaxReducedssignment",
     "assignment_from_stencil",
     "SymbolCreator",
     "create_data_handling",
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 47509e267..f8c2b1870 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -3,7 +3,7 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from sympyextensions.reduction import reduced_assign
+from pystencils import AddReducedAssignment
 
 
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
@@ -32,7 +32,7 @@ def test_log(dtype):
 
     omega = sp.Symbol("omega")
 
-    reduction_assignment = reduced_assign(omega, "+", x.center())
+    reduction_assignment = AddReducedAssignment(omega, x.center())
 
     ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
     code_reduction = ps.get_code_str(ast_reduction)
-- 
GitLab


From fae371d48773f9423855be824f495f3f20abcdc2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:36:18 +0100
Subject: [PATCH 040/180] Get rid of reduction using the division operator

---
 src/pystencils/__init__.py                  | 2 --
 src/pystencils/sympyextensions/reduction.py | 6 +-----
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index eecd929cf..916a61392 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -42,7 +42,6 @@ from .sympyextensions.reduction import (
     AddReducedAssignment,
     SubReducedAssignment,
     MulReducedAssignment,
-    DivReducedAssignment,
     MinReducedssignment,
     MaxReducedssignment
 )
@@ -80,7 +79,6 @@ __all__ = [
     "AddReducedAssignment",
     "SubReducedAssignment",
     "MulReducedAssignment",
-    "DivReducedAssignment",
     "MinReducedssignment",
     "MaxReducedssignment",
     "assignment_from_stencil",
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index aa947c1d2..90ab61ede 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -32,10 +32,6 @@ class MulReducedAssignment(ReducedAssignment):
     binop = '*'
 
 
-class DivReducedAssignment(ReducedAssignment):
-    binop = '/'
-
-
 class MinReducedssignment(ReducedAssignment):
     binop = 'min'
 
@@ -46,7 +42,7 @@ class MaxReducedssignment(ReducedAssignment):
 # Mapping from binary op strings to AugmentedAssignment subclasses
 reduced_assign_classes = {
     cls.binop: cls for cls in [
-        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment, DivReducedAssignment,
+        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment,
         MinReducedssignment, MaxReducedssignment
     ]
 }
-- 
GitLab


From b263d752d8af83569517aa56ef17facfa26adc91 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:39:39 +0100
Subject: [PATCH 041/180] Add functions for numeric limits (to be supported by
 the backends)

---
 src/pystencils/backend/functions.py             | 10 ++++++++++
 src/pystencils/backend/platforms/generic_cpu.py |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 388160f30..ea0d6cb9d 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -94,6 +94,16 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
+class NumericLimitsFunctions(MathFunctions):
+    """Numerical limits functions supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    min = ("min", 0)
+    max = ("max", 0)
+
+
 class PsMathFunction(PsFunction):
     """Homogenously typed mathematical functions."""
 
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index b6d7dd551..affeb34d4 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -43,7 +43,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>"}
+        return {"<math.h>", "<climits.h"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -62,6 +62,8 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
+        # TODO: numeric limits
+
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
             match func:
-- 
GitLab


From 548375295a08f88bef9ba8e597c47fe7143d8139 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 16:42:00 +0100
Subject: [PATCH 042/180] Introduce reduction symbol property and add it to the
 lhs of the reduced symbol

---
 .../backend/kernelcreation/context.py         |  2 ++
 .../backend/kernelcreation/freeze.py          | 28 ++++++++++++-------
 src/pystencils/codegen/properties.py          | 10 +++++++
 src/pystencils/sympyextensions/reduction.py   |  6 ++--
 4 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 8f5931c64..b6bf09dba 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -75,6 +75,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
+        # TODO: add list of reduction symbols
+
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 4d75f1ca6..0d1ce72e1 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -65,6 +65,9 @@ from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions
 from ..exceptions import FreezeError
 
+import backend.functions
+from codegen.properties import ReductionSymbolProperty
+
 
 ExprLike = (
     sp.Expr
@@ -188,27 +191,32 @@ class FreezeExpressions:
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
 
-        assert isinstance(lhs, PsExpression)
         assert isinstance(rhs, PsExpression)
+        assert isinstance(lhs, PsSymbolExpr)
 
         match expr.op:
-            case "+=":
+            case "+":
                 op = add
-            case "-=":
+                init_val = PsConstant(0)
+            case "-":
                 op = sub
-            case "*=":
+                init_val = PsConstant(0)
+            case "*":
                 op = mul
-            case "/=":
-                op = truediv
-            # TODO: unsure if sp.Min & sp.Max work here
-            case "min=":
+                init_val = PsConstant(1)
+            # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
+            case "min":
                 op = sp.Min
-            case "max=":
+                init_val = backend.functions.NumericLimitsFunctions("min")
+            case "max":
                 op = sp.Max
+                init_val = backend.functions.NumericLimitsFunctions("max")
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        return PsAssignment(lhs, op(lhs.clone(), rhs)) # TODO: PsReducedAssignment?
+        lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val))
+
+        return PsAssignment(lhs, op(lhs.clone(), rhs))
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index d377fb3d3..5578d2408 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -3,6 +3,8 @@ from dataclasses import dataclass
 
 from ..field import Field
 
+from backend.ast.expressions import PsExpression
+
 
 @dataclass(frozen=True)
 class PsSymbolProperty:
@@ -14,6 +16,14 @@ class UniqueSymbolProperty(PsSymbolProperty):
     """Base class for unique properties, of which only one instance may be registered at a time."""
 
 
+@dataclass(frozen=True)
+class ReductionSymbolProperty(UniqueSymbolProperty):
+    """Symbol acts as a base pointer to a field."""
+
+    op: str
+    init_val: PsExpression
+
+
 @dataclass(frozen=True)
 class FieldShape(PsSymbolProperty):
     """Symbol acts as a shape parameter to a field."""
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 90ab61ede..e2760cc6c 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -12,13 +12,11 @@ class ReducedAssignment(AssignmentBase):
        Symbol for binary operation being applied in the assignment, such as "+",
        "*", etc.
     """
-    binop = None  # type: str
-
-    # TODO: initial value
+    binop = None # type: str
 
     @property
     def op(self):
-        return self.binop + '='
+        return self.binop
 
 
 class AddReducedAssignment(ReducedAssignment):
-- 
GitLab


From 35c8160bf5f9da900255583bbfb10f935d5b3687 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:19:09 +0100
Subject: [PATCH 043/180] Minor import fixes

---
 src/pystencils/backend/kernelcreation/freeze.py | 9 ++++-----
 src/pystencils/codegen/properties.py            | 2 +-
 src/pystencils/sympyextensions/__init__.py      | 2 --
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 0d1ce72e1..7316e2f9f 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -62,11 +62,10 @@ from ..ast.vector import PsVecMemAcc
 from ..constants import PsConstant
 from ...types import PsNumericType, PsStructType, PsType
 from ..exceptions import PsInputError
-from ..functions import PsMathFunction, MathFunctions
+from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
 
-import backend.functions
-from codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import ReductionSymbolProperty
 
 
 ExprLike = (
@@ -207,10 +206,10 @@ class FreezeExpressions:
             # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
             case "min":
                 op = sp.Min
-                init_val = backend.functions.NumericLimitsFunctions("min")
+                init_val = NumericLimitsFunctions("min")
             case "max":
                 op = sp.Max
-                init_val = backend.functions.NumericLimitsFunctions("max")
+                init_val = NumericLimitsFunctions("max")
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 5578d2408..cc4ff4101 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 
 from ..field import Field
 
-from backend.ast.expressions import PsExpression
+from ..backend.ast.expressions import PsExpression
 
 
 @dataclass(frozen=True)
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 6ab24e936..7431416c9 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,7 +1,6 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
-from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -34,7 +33,6 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
-    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
-- 
GitLab


From d8a717874a0156d0c27c75b04c859b3a4a98268d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:50:51 +0100
Subject: [PATCH 044/180] Add dictionary of reduced symbols to codegen context

---
 .../backend/kernelcreation/context.py           | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index b6bf09dba..39205d707 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -75,7 +75,7 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        # TODO: add list of reduction symbols
+        self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -170,6 +170,21 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
+    def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty):
+        """Adds a reduction property to a symbol.
+
+        The symbol ``symbol`` should not have a reduction property and must exist in the symbol table.
+        """
+        if self.find_symbol(symbol.name) is None:
+            raise PsInternalCompilerError(
+                "add_reduction_to_symbol: Symbol does not exist in the symbol table"
+            )
+
+        if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
+            self._symbols_with_reduction[symbol] = reduction
+        else:
+            raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property")
+
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
     ) -> PsSymbol:
-- 
GitLab


From 3d592ab07194bc623884b81413278d954434e89b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 15 Jan 2025 17:51:22 +0100
Subject: [PATCH 045/180] Try fixing circular module import

---
 src/pystencils/backend/kernelcreation/context.py | 2 ++
 src/pystencils/codegen/properties.py             | 6 ++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 39205d707..258204f8d 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -9,6 +9,8 @@ from ...defaults import DEFAULTS
 from ...field import Field, FieldType
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
+from ...codegen.properties import ReductionSymbolProperty
+
 from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
 from ...types import (
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index cc4ff4101..2b0af986a 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,9 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
-
-from ..backend.ast.expressions import PsExpression
-
+from typing import Any
 
 @dataclass(frozen=True)
 class PsSymbolProperty:
@@ -21,7 +19,7 @@ class ReductionSymbolProperty(UniqueSymbolProperty):
     """Symbol acts as a base pointer to a field."""
 
     op: str
-    init_val: PsExpression
+    init_val: Any # TODO: type?
 
 
 @dataclass(frozen=True)
-- 
GitLab


From e5861425c5e2e4868e0c696763297768814c02ec Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 12:59:16 +0100
Subject: [PATCH 046/180] Minor adaptation of how symbols are given the
 reduction property

---
 src/pystencils/backend/kernelcreation/context.py | 5 +++--
 src/pystencils/backend/kernelcreation/freeze.py  | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 258204f8d..e41f8371c 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -179,13 +179,14 @@ class KernelCreationContext:
         """
         if self.find_symbol(symbol.name) is None:
             raise PsInternalCompilerError(
-                "add_reduction_to_symbol: Symbol does not exist in the symbol table"
+                f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table"
             )
 
         if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
+            symbol.add_property(reduction)
             self._symbols_with_reduction[symbol] = reduction
         else:
-            raise PsInternalCompilerError(f"add_reduction_to_symbol: Symbol {symbol.name} already has a reduction property")
+            raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property")
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 7316e2f9f..ae728dd49 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -213,7 +213,7 @@ class FreezeExpressions:
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        lhs.symbol.add_property(ReductionSymbolProperty(expr.op, init_val))
+        self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
 
         return PsAssignment(lhs, op(lhs.clone(), rhs))
 
-- 
GitLab


From c96a94619ff058ba81d798b6f4bfbe06cd273535 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 13:33:12 +0100
Subject: [PATCH 047/180] Add C function selection for numeric limits functions

---
 src/pystencils/backend/platforms/generic_cpu.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index affeb34d4..6e3c58e6f 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -3,8 +3,8 @@ from typing import Sequence
 
 from pystencils.backend.ast.expressions import PsCall
 
-from ..functions import CFunction, PsMathFunction, MathFunctions
-from ...types import PsIntegerType, PsIeeeFloatType
+from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType
 
 from .platform import Platform
 from ..exceptions import MaterializationError
@@ -62,7 +62,10 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        # TODO: numeric limits
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
+            call.function = cfunc
+            return call
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
-- 
GitLab


From 99a3335135baf5da92ec56713f96beae37798b60 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 15:52:05 +0100
Subject: [PATCH 048/180] Add omp reduction clauses for reduced symbols

---
 src/pystencils/backend/kernelcreation/context.py      | 5 +++++
 src/pystencils/backend/transformations/add_pragmas.py | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index e41f8371c..a8728e6ac 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -223,6 +223,11 @@ class KernelCreationContext:
         """Return an iterable of all symbols listed in the symbol table."""
         return self._symbols.values()
 
+    @property
+    def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]:
+        """Return a dictionary holding symbols and their reduction property."""
+        return self._symbols_with_reduction
+
     #   Fields and Arrays
 
     @property
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 78e721f38..6d72e1550 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -10,6 +10,8 @@ from ..ast import PsAstNode
 from ..ast.structural import PsBlock, PsLoop, PsPragma
 from ..ast.expressions import PsExpression
 
+from ...types import PsScalarType
+
 if TYPE_CHECKING:
     from ...codegen.config import OpenMpConfig
 
@@ -110,6 +112,13 @@ class AddOpenMP:
         pragma_text += " parallel" if not omp_params.omit_parallel_construct else ""
         pragma_text += f" for schedule({omp_params.schedule})"
 
+        if bool(ctx.symbols_with_reduction):
+            for symbol, reduction in ctx.symbols_with_reduction.items():
+                if isinstance(symbol.dtype, PsScalarType):
+                    pragma_text += f" reduction({reduction.op}: {symbol.name})"
+                else:
+                    raise NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.")
+
         if omp_params.num_threads is not None:
             pragma_text += f" num_threads({str(omp_params.num_threads)})"
 
-- 
GitLab


From f00708edcaec67412905fbd26842735152da9812 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:15:24 +0100
Subject: [PATCH 049/180] Reformat reduction.py

---
 src/pystencils/sympyextensions/reduction.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index e2760cc6c..c9e5bfdfb 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -12,7 +12,7 @@ class ReducedAssignment(AssignmentBase):
        Symbol for binary operation being applied in the assignment, such as "+",
        "*", etc.
     """
-    binop = None # type: str
+    binop = None  # type: str
 
     @property
     def op(self):
@@ -22,6 +22,7 @@ class ReducedAssignment(AssignmentBase):
 class AddReducedAssignment(ReducedAssignment):
     binop = '+'
 
+
 class SubReducedAssignment(ReducedAssignment):
     binop = '-'
 
@@ -33,6 +34,7 @@ class MulReducedAssignment(ReducedAssignment):
 class MinReducedssignment(ReducedAssignment):
     binop = 'min'
 
+
 class MaxReducedssignment(ReducedAssignment):
     binop = 'max'
 
@@ -45,7 +47,8 @@ reduced_assign_classes = {
     ]
 }
 
+
 def reduced_assign(lhs, op, rhs):
     if op not in reduced_assign_classes:
         raise ValueError("Unrecognized operator %s" % op)
-    return reduced_assign_classes[op](lhs, rhs)
\ No newline at end of file
+    return reduced_assign_classes[op](lhs, rhs)
-- 
GitLab


From 71aaf722f0a3aecfcc79e2633311aa8f5677b42d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:18:37 +0100
Subject: [PATCH 050/180] Add back reduced_assign to sympyextensions interface

---
 src/pystencils/sympyextensions/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 7431416c9..6ab24e936 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
+from .reduction import reduced_assign
 
 from .math import (
     prod,
@@ -33,6 +34,7 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
+    "reduced_assign",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
-- 
GitLab


From f30ca33b9ea897e66c996759a40b1aebf43a3688 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:19:23 +0100
Subject: [PATCH 051/180] Fix inheritance of special math function enum classes

---
 src/pystencils/backend/functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index ea0d6cb9d..736345395 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -94,7 +94,7 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
-class NumericLimitsFunctions(MathFunctions):
+class NumericLimitsFunctions(Enum):
     """Numerical limits functions supported by the backend.
 
     Each platform has to materialize these functions to a concrete implementation.
@@ -109,12 +109,12 @@ class PsMathFunction(PsFunction):
 
     __match_args__ = ("func",)
 
-    def __init__(self, func: MathFunctions) -> None:
+    def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None:
         super().__init__(func.function_name, func.num_args)
         self._func = func
 
     @property
-    def func(self) -> MathFunctions:
+    def func(self) -> MathFunctions | NumericLimitsFunctions:
         return self._func
 
     def __str__(self) -> str:
-- 
GitLab


From 8fb5af398d208cf9779f7b0c528237d0d79ef7fb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 16:20:14 +0100
Subject: [PATCH 052/180] Fix header include of limits.h

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 6e3c58e6f..ae59d0423 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -43,7 +43,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>", "<climits.h"}
+        return {"<math.h>", "<limits.h>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
-- 
GitLab


From dc5898a77e7b72a2221af8b66fa56640c2516e76 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 17:30:25 +0100
Subject: [PATCH 053/180] Omit distinction between normal and reduced
 assignments in AssignmentCollection

---
 src/pystencils/simp/assignment_collection.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index 212dbf751..03b4edccf 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -9,8 +9,6 @@ import pystencils
 from ..assignment import Assignment
 from .simplifications import (sort_assignments_topologically, transform_lhs_and_rhs, transform_rhs)
 from ..sympyextensions import count_operations, fast_subs
-from ..sympyextensions import reduced_assign
-from ..sympyextensions.reduction import ReducedAssignment
 
 
 class AssignmentCollection:
@@ -58,13 +56,8 @@ class AssignmentCollection:
         subexpressions = list(itertools.chain.from_iterable(
             [(a if isinstance(a, Iterable) else [a]) for a in subexpressions]))
 
-        # filter out reduced assignments
-        reduced_assignments = [a for a in main_assignments if isinstance(a, ReducedAssignment)]
-        main_assignments = [a for a in main_assignments if (a not in reduced_assignments)]
-
         self.main_assignments = main_assignments
         self.subexpressions = subexpressions
-        self.reductions = reduced_assignments
 
         if simplification_hints is None:
             simplification_hints = {}
@@ -79,11 +72,6 @@ class AssignmentCollection:
         else:
             self.subexpression_symbol_generator = subexpression_symbol_generator
 
-    def add_reduction(self, lhs: sp.Symbol, op: str, rhs: sp.Expr) -> None:
-        """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
-        assert lhs not in self.reductions, f"Reduction for lhs {lhs} exists"
-        self.reductions.append(reduced_assign(lhs, op, rhs))
-
     def add_simplification_hint(self, key: str, value: Any) -> None:
         """Adds an entry to the simplification_hints dictionary and checks that is does not exist yet."""
         assert key not in self.simplification_hints, "This hint already exists"
-- 
GitLab


From 97c171f3cd5371152fd102d92e302fc13ae4ee2b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 17:38:46 +0100
Subject: [PATCH 054/180] Adaptations to reduction test

---
 tests/kernelcreation/test_reduction.py | 40 ++++++++------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index f8c2b1870..0532b30f5 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -6,39 +6,23 @@ import pystencils as ps
 from pystencils import AddReducedAssignment
 
 
-@pytest.mark.parametrize('dtype', ["float64", "float32"])
-def test_log(dtype):
-    a = sp.Symbol("a")
+@pytest.mark.parametrize('dtype', ["float64"])
+def test_reduction(dtype):
     x = ps.fields(f'x: {dtype}[1d]')
+    w = sp.Symbol("w")
 
-    # kernel with main assignments and no reduction
+    # kernel with reduction assignment
 
-    main_assignment = ps.AssignmentCollection({x.center(): a})
+    reduction_assignment = AddReducedAssignment(w, x.center())
 
-    ast_main = ps.create_kernel(main_assignment, default_dtype=dtype)
-    code_main = ps.get_code_str(ast_main)
-    kernel_main = ast_main.compile()
+    config = ps.CreateKernelConfig(cpu_openmp=True)
 
-    # ps.show_code(ast)
-
-    if dtype == "float64":
-        assert "float" not in code_main
-
-    array = np.zeros((10,), dtype=dtype)
-    kernel_main(x=array, a=100)
-    assert np.allclose(array, 4.60517019)
-
-    # kernel with single reduction assignment
-
-    omega = sp.Symbol("omega")
-
-    reduction_assignment = AddReducedAssignment(omega, x.center())
-
-    ast_reduction = ps.create_kernel(reduction_assignment, default_dtype=dtype)
-    code_reduction = ps.get_code_str(ast_reduction)
+    ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype)
+    #code_reduction = ps.get_code_str(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
-    if dtype == "float64":
-        assert "float" not in code_reduction
+    ps.show_code(ast_reduction)
 
-    ps.show_code(ast_reduction)
\ No newline at end of file
+    array = np.ones((10,), dtype=dtype)
+    kernel_reduction(x=array, w=0)
+    # TODO: check if "w = #points"
\ No newline at end of file
-- 
GitLab


From 355d638aab1179db95204d490fe585f7f4fcb7c1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 18:15:58 +0100
Subject: [PATCH 055/180] Rename min/max of numeric limits enum

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index ae59d0423..620cf9cfb 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -62,7 +62,7 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max):
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
-- 
GitLab


From 55c9812023c384c7978f2536020d8587b7a12019 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 16 Jan 2025 18:38:34 +0100
Subject: [PATCH 056/180] Adapt comment of ReductionSymbolProperty

---
 src/pystencils/codegen/properties.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 2b0af986a..0bad4e898 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from ..field import Field
 from typing import Any
 
+
 @dataclass(frozen=True)
 class PsSymbolProperty:
     """Base class for symbol properties, which can be used to add additional information to symbols"""
@@ -16,10 +17,10 @@ class UniqueSymbolProperty(PsSymbolProperty):
 
 @dataclass(frozen=True)
 class ReductionSymbolProperty(UniqueSymbolProperty):
-    """Symbol acts as a base pointer to a field."""
+    """Property for symbols specifying the operation and initial value for a reduction."""
 
     op: str
-    init_val: Any # TODO: type?
+    init_val: Any  # TODO: type?
 
 
 @dataclass(frozen=True)
-- 
GitLab


From 6f8fbdfe7a6cbf14c6f64a86315fa435ed0c9336 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 13:43:32 +0100
Subject: [PATCH 057/180] Fix removal of function parameters for lhs symbols
 that are not declared in the kernel

---
 src/pystencils/backend/ast/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py
index edeba04f2..7032690a0 100644
--- a/src/pystencils/backend/ast/analysis.py
+++ b/src/pystencils/backend/ast/analysis.py
@@ -62,7 +62,7 @@ class UndefinedSymbolsCollector:
 
             case PsAssignment(lhs, rhs):
                 undefined_vars = self(lhs) | self(rhs)
-                if isinstance(lhs, PsSymbolExpr):
+                if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr):
                     undefined_vars.remove(lhs.symbol)
                 return undefined_vars
 
-- 
GitLab


From b4dd0c8c55d26f87b4467f814c526fafc2ced76b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:29:10 +0100
Subject: [PATCH 058/180] Fix usage of numerical limits for init value of
 reduction

---
 src/pystencils/backend/functions.py             | 8 ++++++--
 src/pystencils/backend/kernelcreation/freeze.py | 4 ++--
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 736345395..18c2277cf 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -100,8 +100,12 @@ class NumericLimitsFunctions(Enum):
     Each platform has to materialize these functions to a concrete implementation.
     """
 
-    min = ("min", 0)
-    max = ("max", 0)
+    Min = ("min", 0)
+    Max = ("max", 0)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
 
 
 class PsMathFunction(PsFunction):
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index ae728dd49..9a34303e2 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -206,10 +206,10 @@ class FreezeExpressions:
             # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
             case "min":
                 op = sp.Min
-                init_val = NumericLimitsFunctions("min")
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
             case "max":
                 op = sp.Max
-                init_val = NumericLimitsFunctions("max")
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 620cf9cfb..ae59d0423 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -62,7 +62,7 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.min, NumericLimitsFunctions.max):
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
-- 
GitLab


From ffcd54e053a2ddda2e015d2836184f2fcaef59f3 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:55:42 +0100
Subject: [PATCH 059/180] Fix min/max reductions

---
 src/pystencils/backend/kernelcreation/freeze.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 9a34303e2..64230203f 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -193,29 +193,31 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
+        new_rhs: PsExpression
         match expr.op:
             case "+":
-                op = add
                 init_val = PsConstant(0)
+                new_rhs = add(lhs.clone(), rhs)
             case "-":
-                op = sub
                 init_val = PsConstant(0)
+                new_rhs = sub(lhs.clone(), rhs)
             case "*":
-                op = mul
                 init_val = PsConstant(1)
-            # TODO: unsure if sp.Min & sp.Max are mapped by map_Min/map_Max afterwards
+                new_rhs = mul(lhs.clone(), rhs)
             case "min":
-                op = sp.Min
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs])
             case "max":
-                op = sp.Max
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
+        # set reduction symbol property in context
         self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
 
-        return PsAssignment(lhs, op(lhs.clone(), rhs))
+        return PsAssignment(lhs, new_rhs)
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
-- 
GitLab


From dcdfff042120db9960f1859a60fdeb1890f02878 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 14:56:12 +0100
Subject: [PATCH 060/180] Parameterize test_reduction.py for different
 reduction operations

---
 tests/kernelcreation/test_reduction.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 0532b30f5..c41d250f4 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -3,17 +3,18 @@ import numpy as np
 import sympy as sp
 
 import pystencils as ps
-from pystencils import AddReducedAssignment
+from pystencils.sympyextensions import reduced_assign
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-def test_reduction(dtype):
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+def test_reduction(dtype, op):
     x = ps.fields(f'x: {dtype}[1d]')
     w = sp.Symbol("w")
 
     # kernel with reduction assignment
 
-    reduction_assignment = AddReducedAssignment(w, x.center())
+    reduction_assignment = reduced_assign(w, op, x.center())
 
     config = ps.CreateKernelConfig(cpu_openmp=True)
 
-- 
GitLab


From 2c15b9890291deea0cb929ae3a5221f3a0671a45 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 15:02:19 +0100
Subject: [PATCH 061/180] Define type of init_val for reduction as Any

---
 src/pystencils/backend/kernelcreation/freeze.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 64230203f..840329013 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -195,6 +195,7 @@ class FreezeExpressions:
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
+        init_val: Any  # TODO: type?
         match expr.op:
             case "+":
                 init_val = PsConstant(0)
-- 
GitLab


From 6a7a251f77d0274c32729bc5bfacfe0308d3fec9 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 17 Jan 2025 15:06:57 +0100
Subject: [PATCH 062/180] Try fix mypy no-redef error

---
 src/pystencils/backend/platforms/generic_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index ae59d0423..2b4309627 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -63,12 +63,12 @@ class GenericCpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            cfunc: CFunction
             cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
             call.function = cfunc
             return call
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
-            cfunc: CFunction
             match func:
                 case (
                     MathFunctions.Exp
-- 
GitLab


From 45ab4e86617492462188a5fc46d3160450a54bf0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 20 Jan 2025 17:46:49 +0100
Subject: [PATCH 063/180] Try initializing kernel-local reduction variable copy

---
 .../backend/kernelcreation/freeze.py          | 28 +++++++++++--------
 src/pystencils/codegen/driver.py              | 12 +++++++-
 src/pystencils/codegen/properties.py          |  7 +++--
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 840329013..e0dcba8fd 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -7,6 +7,7 @@ import sympy.core.relational
 import sympy.logic.boolalg
 from sympy.codegen.ast import AssignmentBase, AugmentedAssignment
 
+from ..memory import PsSymbol
 from ...assignment import Assignment
 from ...simp import AssignmentCollection
 from ...sympyextensions import (
@@ -193,32 +194,37 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        # create kernel-local copy of lhs symbol to work with
+        new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype)
+        new_lhs = PsSymbolExpr(new_lhs_symbol)
+        self._ctx.add_symbol(new_lhs_symbol)
+
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
-        init_val: Any  # TODO: type?
+        init_val: PsExpression
         match expr.op:
             case "+":
-                init_val = PsConstant(0)
-                new_rhs = add(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(0))
+                new_rhs = add(new_lhs.clone(), rhs)
             case "-":
-                init_val = PsConstant(0)
-                new_rhs = sub(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(0))
+                new_rhs = sub(new_lhs.clone(), rhs)
             case "*":
-                init_val = PsConstant(1)
-                new_rhs = mul(lhs.clone(), rhs)
+                init_val = PsConstantExpr(PsConstant(1))
+                new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [lhs.clone(), rhs])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [lhs.clone(), rhs])
+                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
         # set reduction symbol property in context
-        self._ctx.add_reduction_to_symbol(lhs.symbol, ReductionSymbolProperty(expr.op, init_val))
+        self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol))
 
-        return PsAssignment(lhs, new_rhs)
+        return PsAssignment(new_lhs, new_rhs)
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
         symb = self._ctx.get_symbol(spsym.name)
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 28b685b55..0293cce48 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,12 +7,13 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
+from ..backend.ast.expressions import PsSymbolExpr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop
+from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -152,6 +153,14 @@ class DefaultKernelCreationDriver:
         if self._intermediates is not None:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
+        #   Init local reduction variable copy
+        # for red, prop in self._ctx.symbols_with_reduction.items():
+        #     kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+
+        #   Write back result to reduction target variable
+        # for red, prop in self._ctx.symbols_with_reduction.items():
+        #     kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
             kernel_ast = self._transform_for_cpu(kernel_ast)
@@ -450,6 +459,7 @@ def _get_function_params(
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
+                # TODO: how to export reduction result (via pointer)?
                 case FieldShape() | FieldStride():
                     props.add(prop)
                 case BufferBasePtr(buf):
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 0bad4e898..4b8e7f2bf 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
-from typing import Any
 
 
 @dataclass(frozen=True)
@@ -19,8 +18,12 @@ class UniqueSymbolProperty(PsSymbolProperty):
 class ReductionSymbolProperty(UniqueSymbolProperty):
     """Property for symbols specifying the operation and initial value for a reduction."""
 
+    from ..backend.memory import PsSymbol
+    from ..backend.ast.expressions import PsExpression
+
     op: str
-    init_val: Any  # TODO: type?
+    init_val: PsExpression
+    orig_symbol: PsSymbol
 
 
 @dataclass(frozen=True)
-- 
GitLab


From 0a9abc2a24ec30a444613178a94382da5355a6ef Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 13:55:35 +0100
Subject: [PATCH 064/180] Swap out neutral init values for reduced assignments
 with min/max op

---
 src/pystencils/backend/kernelcreation/freeze.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index e0dcba8fd..06d98a44e 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -213,10 +213,10 @@ class FreezeExpressions:
                 init_val = PsConstantExpr(PsConstant(1))
                 new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
-- 
GitLab


From 3c276118bec2981a9bfc5a0d9654fec358094269 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 17:34:06 +0100
Subject: [PATCH 065/180] Fix declaration of local reduction var and write back
 to original variable

---
 src/pystencils/codegen/driver.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 0293cce48..06a5fd44a 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -13,7 +13,7 @@ from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment
+from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -154,12 +154,16 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Init local reduction variable copy
-        # for red, prop in self._ctx.symbols_with_reduction.items():
-        #     kernel_ast.statements = [PsAssignment(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+        for red, prop in self._ctx.symbols_with_reduction.items():
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
 
         #   Write back result to reduction target variable
-        # for red, prop in self._ctx.symbols_with_reduction.items():
-        #     kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+        for red, prop in self._ctx.symbols_with_reduction.items():
+            kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
+
+        # TODO: can this be omitted?
+        typify = Typifier(self._ctx)
+        kernel_ast = typify(kernel_ast)
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
-- 
GitLab


From c51ae2b438d688f871ba45b69476ef0c3b475462 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 21 Jan 2025 18:42:43 +0100
Subject: [PATCH 066/180] Set type of reduced variable to pointer and write
 back via PsMemAcc

---
 src/pystencils/backend/kernelcreation/freeze.py | 15 ++++++++++-----
 src/pystencils/codegen/driver.py                | 10 ++++------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 06d98a44e..d8fb1b91e 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -61,7 +61,7 @@ from ..ast.expressions import (
 from ..ast.vector import PsVecMemAcc
 
 from ..constants import PsConstant
-from ...types import PsNumericType, PsStructType, PsType
+from ...types import PsNumericType, PsStructType, PsType, PsPointerType
 from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
@@ -195,9 +195,9 @@ class FreezeExpressions:
         assert isinstance(lhs, PsSymbolExpr)
 
         # create kernel-local copy of lhs symbol to work with
-        new_lhs_symbol = PsSymbol(f"{lhs.symbol.name}_local", lhs.dtype)
-        new_lhs = PsSymbolExpr(new_lhs_symbol)
-        self._ctx.add_symbol(new_lhs_symbol)
+        new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype)
+        new_lhs = PsSymbolExpr(new_lhs_symb)
+        self._ctx.add_symbol(new_lhs_symb)
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
@@ -221,8 +221,13 @@ class FreezeExpressions:
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
+        # replace original symbol with pointer-based type used for export
+        orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype))
+        self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr)
+
         # set reduction symbol property in context
-        self._ctx.add_reduction_to_symbol(new_lhs_symbol, ReductionSymbolProperty(expr.op, init_val, lhs.symbol))
+        init_val.dtype = rhs.dtype
+        self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr))
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 06a5fd44a..20615ba21 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
-from ..backend.ast.expressions import PsSymbolExpr
+from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
@@ -159,11 +159,9 @@ class DefaultKernelCreationDriver:
 
         #   Write back result to reduction target variable
         for red, prop in self._ctx.symbols_with_reduction.items():
-            kernel_ast.statements += [PsAssignment(PsSymbolExpr(prop.orig_symbol), PsSymbolExpr(red))]
-
-        # TODO: can this be omitted?
-        typify = Typifier(self._ctx)
-        kernel_ast = typify(kernel_ast)
+            kernel_ast.statements += [PsAssignment(
+                PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
+                PsSymbolExpr(red))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
-- 
GitLab


From c6eedfcda96e84e8279ee624ba3c113f2339bfbe Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 15:16:32 +0100
Subject: [PATCH 067/180] Split reduction var property into local and
 pointer-based reduction var properties

---
 .../backend/kernelcreation/context.py         | 54 ++++++++++++++-----
 .../backend/kernelcreation/freeze.py          | 31 ++++++-----
 .../backend/transformations/add_pragmas.py    |  4 +-
 src/pystencils/codegen/driver.py              | 10 ++--
 src/pystencils/codegen/properties.py          | 16 ++++--
 5 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index a8728e6ac..2f46a7421 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -9,7 +9,7 @@ from ...defaults import DEFAULTS
 from ...field import Field, FieldType
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
-from ...codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
 
 from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
@@ -77,7 +77,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        self._symbols_with_reduction: dict[PsSymbol, ReductionSymbolProperty] = dict()
+        self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict()
+        self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -172,21 +173,41 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
-    def add_reduction_to_symbol(self, symbol: PsSymbol, reduction: ReductionSymbolProperty):
-        """Adds a reduction property to a symbol.
+    def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable):
+        """Adds entry for a symbol and its property to the lookup table for local reduction variables.
 
-        The symbol ``symbol`` should not have a reduction property and must exist in the symbol table.
+        The symbol ``local_symb`` must neither have a 'LocalReductionVariable' property nor exist in the symbol table.
         """
-        if self.find_symbol(symbol.name) is None:
+        if self.find_symbol(local_symb.name) is not None:
             raise PsInternalCompilerError(
-                f"add_reduction_to_symbol: {symbol.name} does not exist in the symbol table"
+                f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table"
             )
+        self.add_symbol(local_symb)
 
-        if symbol not in self._symbols_with_reduction and not symbol.get_properties(ReductionSymbolProperty):
-            symbol.add_property(reduction)
-            self._symbols_with_reduction[symbol] = reduction
+        if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable):
+            local_symb.add_property(local_var_prop)
+            self._local_reduction_symbols[local_symb] = local_var_prop
         else:
-            raise PsInternalCompilerError(f"add_reduction_to_symbol: {symbol.name} already has a reduction property")
+            raise PsInternalCompilerError(
+                f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table"
+            )
+
+    def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable):
+        """Replaces reduction symbol with a pointer-based counterpart used for export
+        and adds the new symbol and its property to the lookup table for pointer-based reduction variables
+
+        The symbol ``ptr_symb`` should not exist in the symbol table.
+        """
+        self.replace_symbol(orig_symb, ptr_symb)
+
+        if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties(
+                ReductionPointerVariable):
+            ptr_symb.add_property(ptr_var_prop)
+            self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop
+        else:
+            raise PsInternalCompilerError(
+                f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table "
+            )
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
@@ -224,9 +245,14 @@ class KernelCreationContext:
         return self._symbols.values()
 
     @property
-    def symbols_with_reduction(self) -> dict[PsSymbol, ReductionSymbolProperty]:
-        """Return a dictionary holding symbols and their reduction property."""
-        return self._symbols_with_reduction
+    def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]:
+        """Return a dictionary holding kernel-local reduction symbols and their reduction properties."""
+        return self._local_reduction_symbols
+
+    @property
+    def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]:
+        """Return a dictionary holding pointer-based reduction symbols and their reduction properties."""
+        return self._reduction_ptr_symbols
 
     #   Fields and Arrays
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index d8fb1b91e..1e9984def 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -66,7 +66,7 @@ from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
 
-from ...codegen.properties import ReductionSymbolProperty
+from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
 
 
 ExprLike = (
@@ -194,40 +194,45 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        orig_lhs_symb = lhs.symbol
+        dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts?
+
+        # replace original symbol with pointer-based type used for export
+        orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
+
         # create kernel-local copy of lhs symbol to work with
-        new_lhs_symb = PsSymbol(f"{lhs.symbol.name}_local", rhs.dtype)
+        new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype)
         new_lhs = PsSymbolExpr(new_lhs_symb)
-        self._ctx.add_symbol(new_lhs_symb)
 
         # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
         new_rhs: PsExpression
         init_val: PsExpression
         match expr.op:
             case "+":
-                init_val = PsConstantExpr(PsConstant(0))
+                init_val = PsConstantExpr(PsConstant(0, dtype))
                 new_rhs = add(new_lhs.clone(), rhs)
             case "-":
-                init_val = PsConstantExpr(PsConstant(0))
+                init_val = PsConstantExpr(PsConstant(0, dtype))
                 new_rhs = sub(new_lhs.clone(), rhs)
             case "*":
-                init_val = PsConstantExpr(PsConstant(1))
+                init_val = PsConstantExpr(PsConstant(1, dtype))
                 new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+                init_val.dtype = dtype
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+                init_val.dtype = dtype
                 new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        # replace original symbol with pointer-based type used for export
-        orig_symbol_as_ptr = PsSymbol(lhs.symbol.name, PsPointerType(rhs.dtype))
-        self._ctx.replace_symbol(lhs.symbol, orig_symbol_as_ptr)
-
-        # set reduction symbol property in context
-        init_val.dtype = rhs.dtype
-        self._ctx.add_reduction_to_symbol(new_lhs_symb, ReductionSymbolProperty(expr.op, init_val, orig_symbol_as_ptr))
+        # set reduction symbol properties (local/pointer variables) in context
+        self._ctx.add_local_reduction_symbol(new_lhs_symb,
+                                             LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr))
+        self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr,
+                                           ReductionPointerVariable(expr.op, new_lhs_symb))
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 6d72e1550..44d1d1ede 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -112,8 +112,8 @@ class AddOpenMP:
         pragma_text += " parallel" if not omp_params.omit_parallel_construct else ""
         pragma_text += f" for schedule({omp_params.schedule})"
 
-        if bool(ctx.symbols_with_reduction):
-            for symbol, reduction in ctx.symbols_with_reduction.items():
+        if bool(ctx.local_reduction_symbols):
+            for symbol, reduction in ctx.local_reduction_symbols.items():
                 if isinstance(symbol.dtype, PsScalarType):
                     pragma_text += f" reduction({reduction.op}: {symbol.name})"
                 else:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 20615ba21..7f90f62ce 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -154,14 +154,14 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Init local reduction variable copy
-        for red, prop in self._ctx.symbols_with_reduction.items():
-            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(red), prop.init_val)] + kernel_ast.statements
+        for local_red, prop in self._ctx.local_reduction_symbols.items():
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements
 
         #   Write back result to reduction target variable
-        for red, prop in self._ctx.symbols_with_reduction.items():
+        for red_ptr, prop in self._ctx.reduction_pointer_symbols.items():
             kernel_ast.statements += [PsAssignment(
-                PsMemAcc(PsSymbolExpr(prop.orig_symbol), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
-                PsSymbolExpr(red))]
+                PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
+                PsSymbolExpr(prop.local_symbol))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 4b8e7f2bf..1e71c5b98 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -15,15 +15,25 @@ class UniqueSymbolProperty(PsSymbolProperty):
 
 
 @dataclass(frozen=True)
-class ReductionSymbolProperty(UniqueSymbolProperty):
-    """Property for symbols specifying the operation and initial value for a reduction."""
+class LocalReductionVariable(PsSymbolProperty):
+    """Property for symbols specifying the operation and initial value for a kernel-local reduction variable."""
 
     from ..backend.memory import PsSymbol
     from ..backend.ast.expressions import PsExpression
 
     op: str
     init_val: PsExpression
-    orig_symbol: PsSymbol
+    ptr_symbol: PsSymbol
+
+
+@dataclass(frozen=True)
+class ReductionPointerVariable(PsSymbolProperty):
+    """Property for pointer-type symbols exporting the reduction result from the kernel."""
+
+    from ..backend.memory import PsSymbol
+
+    op: str
+    local_symbol: PsSymbol
 
 
 @dataclass(frozen=True)
-- 
GitLab


From 3e0daa67359c7ddc17264b7fd21aa0a0429552e5 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:01:00 +0100
Subject: [PATCH 068/180] Propagate properties of reduction pointer symbols to
 kernel parameters

---
 src/pystencils/codegen/driver.py           | 5 +++--
 src/pystencils/jit/cpu_extension_module.py | 7 +++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 7f90f62ce..f414b953e 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, replace
 from .target import Target
 from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
+from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
 from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 
@@ -461,7 +461,8 @@ def _get_function_params(
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
-                # TODO: how to export reduction result (via pointer)?
+                case ReductionPointerVariable():
+                    props.add(prop)
                 case FieldShape() | FieldStride():
                     props.add(prop)
                 case BufferBasePtr(buf):
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index befb033e6..c2c969eaa 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -13,7 +13,7 @@ from ..codegen import (
     Kernel,
     Parameter,
 )
-from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride
+from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable
 from ..types import (
     PsType,
     PsUnsignedIntegerType,
@@ -265,7 +265,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return self._array_buffers[field]
 
     def extract_scalar(self, param: Parameter) -> str:
-        if param not in self._scalar_extractions:
+        if any(isinstance(e, ReductionPointerVariable) for e in param.properties):
+            # TODO: implement
+            pass
+        elif param not in self._scalar_extractions:
             extract_func = self._scalar_extractor(param.dtype)
             code = self.TMPL_EXTRACT_SCALAR.format(
                 name=param.name,
-- 
GitLab


From 777ab888d5032d7630827f91fd26c388a9a09db2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:01:51 +0100
Subject: [PATCH 069/180] Use literals for C macros used for the numeric limits

---
 .../backend/platforms/generic_cpu.py          | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 2b4309627..58b9c7946 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -4,6 +4,7 @@ from typing import Sequence
 from pystencils.backend.ast.expressions import PsCall
 
 from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ..literals import PsLiteral
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType
 
 from .platform import Platform
@@ -25,7 +26,7 @@ from ..ast.expressions import (
     PsLookup,
     PsGe,
     PsLe,
-    PsTernary,
+    PsTernary, PsLiteralExpr,
 )
 from ..ast.vector import PsVecMemAcc
 from ...types import PsVectorType, PsCustomType
@@ -43,7 +44,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>", "<limits.h>"}
+        return {"<math.h>", "<limits.h>", "<float.h>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -63,12 +64,25 @@ class GenericCpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
-            cfunc: CFunction
-            cfunc = CFunction(f"{dtype.c_string()}_{func.function_name}".capitalize(), arg_types, dtype)
-            call.function = cfunc
-            return call
+            # get type prefix for macro
+            # TODO: there must be a better way...
+            tpe = ""
+            match dtype:
+                case PsIeeeFloatType():
+                    match dtype.width:
+                        case 32:
+                            tpe = "FLT"
+                        case 64:
+                            tpe = "DBL"
+                case _:
+                    raise MaterializationError(
+                        f"No implementation available for function {func} on data type {dtype}"
+                    )
+
+            return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype))
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
+            cfunc: CFunction
             match func:
                 case (
                     MathFunctions.Exp
-- 
GitLab


From f1c556e6f93d5fa042e12e8a0a9c57f3bdea47b7 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 16:30:43 +0100
Subject: [PATCH 070/180] Integrate reduction pointers to parameters.py

---
 src/pystencils/codegen/parameters.py       | 16 ++++++++++++++--
 src/pystencils/jit/cpu_extension_module.py | 17 ++++++++++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index d8411266e..094553517 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -1,14 +1,14 @@
 from __future__ import annotations
 
 from warnings import warn
-from typing import Sequence, Iterable
+from typing import Sequence, Iterable, Optional
 
 from .properties import (
     PsSymbolProperty,
     _FieldProperty,
     FieldShape,
     FieldStride,
-    FieldBasePtr,
+    FieldBasePtr, ReductionPointerVariable,
 )
 from ..types import PsType
 from ..field import Field
@@ -39,6 +39,9 @@ class Parameter:
                 key=lambda f: f.name,
             )
         )
+        self._reduction_ptr: Optional[ReductionPointerVariable] = next(
+            (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None
+        )
 
     @property
     def name(self):
@@ -79,6 +82,11 @@ class Parameter:
         """Set of fields associated with this parameter."""
         return self._fields
 
+    @property
+    def reduction_pointer(self) -> Optional[ReductionPointerVariable]:
+        """Reduction pointer associated with this parameter."""
+        return self._reduction_ptr
+
     def get_properties(
         self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...]
     ) -> set[PsSymbolProperty]:
@@ -105,6 +113,10 @@ class Parameter:
         )
         return bool(self.get_properties(FieldBasePtr))
 
+    @property
+    def is_reduction_pointer(self) -> bool:
+        return bool(self._reduction_ptr)
+
     @property
     def is_field_stride(self) -> bool:  # pragma: no cover
         warn(
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index c2c969eaa..f9c04200c 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -206,6 +206,8 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()
 
+        self._reduction_ptrs: dict[Parameter, str] = dict()
+
         self._constraint_checks: list[str] = []
 
         self._call: str | None = None
@@ -265,10 +267,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return self._array_buffers[field]
 
     def extract_scalar(self, param: Parameter) -> str:
-        if any(isinstance(e, ReductionPointerVariable) for e in param.properties):
-            # TODO: implement
-            pass
-        elif param not in self._scalar_extractions:
+        if param not in self._scalar_extractions:
             extract_func = self._scalar_extractor(param.dtype)
             code = self.TMPL_EXTRACT_SCALAR.format(
                 name=param.name,
@@ -279,6 +278,12 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return param.name
 
+    def extract_reduction_ptr(self, param: Parameter) -> str:
+        if param not in self._reduction_ptrs:
+            # TODO: implement
+            pass
+        return param.name
+
     def extract_array_assoc_var(self, param: Parameter) -> str:
         if param not in self._array_assoc_var_extractions:
             field = param.fields[0]
@@ -306,7 +311,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return param.name
 
     def extract_parameter(self, param: Parameter):
-        if param.is_field_parameter:
+        if param.is_reduction_pointer:
+            self.extract_reduction_ptr(param)
+        elif param.is_field_parameter:
             self.extract_array_assoc_var(param)
         else:
             self.extract_scalar(param)
-- 
GitLab


From ba697180cac45133f756364ef8798d8437852026 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 17:23:02 +0100
Subject: [PATCH 071/180] Rewire existing code extraction of fields to support
 reduction pointer extraction

---
 src/pystencils/jit/cpu_extension_module.py | 53 ++++++++++++----------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index f9c04200c..d8d90c924 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -199,9 +199,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 """
 
     def __init__(self) -> None:
-        self._array_buffers: dict[Field, str] = dict()
-        self._array_extractions: dict[Field, str] = dict()
-        self._array_frees: dict[Field, str] = dict()
+        self._array_buffers: dict[Any, str] = dict()
+        self._array_extractions: dict[Any, str] = dict()
+        self._array_frees: dict[Any, str] = dict()
 
         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()
@@ -235,36 +235,37 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         else:
             return None
 
-    def extract_field(self, field: Field) -> str:
+    def extract_buffer(self, buffer: Any, name: str, dtype: PsType) -> str:
         """Adds an array, and returns the name of the underlying Py_Buffer."""
-        if field not in self._array_extractions:
-            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name)
+        if buffer not in self._array_extractions:
+            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name)
 
             #   Check array type
-            type_char = self._type_char(field.dtype)
+            type_char = self._type_char(dtype)
             if type_char is not None:
-                dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'"
+                dtype_cond = f"buffer_{name}.format[0] == '{type_char}'"
                 extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
                     cond=dtype_cond,
                     what="data type",
-                    name=field.name,
-                    expected=str(field.dtype),
+                    name=name,
+                    expected=str(dtype),
                 )
 
             #   Check item size
-            itemsize = field.dtype.itemsize
-            item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}"
-            extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
-                cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize
-            )
+            itemsize = dtype.itemsize
+            if itemsize is not None:  # itemsize of pointer not known (TODO?)
+                item_size_cond = f"buffer_{name}.itemsize == {itemsize}"
+                extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format(
+                    cond=item_size_cond, what="itemsize", name=name, expected=itemsize
+                )
 
-            self._array_buffers[field] = f"buffer_{field.name}"
-            self._array_extractions[field] = extraction_code
+            self._array_buffers[buffer] = f"buffer_{name}"
+            self._array_extractions[buffer] = extraction_code
 
-            release_code = f"PyBuffer_Release(&buffer_{field.name});"
-            self._array_frees[field] = release_code
+            release_code = f"PyBuffer_Release(&buffer_{name});"
+            self._array_frees[buffer] = release_code
 
-        return self._array_buffers[field]
+        return self._array_buffers[buffer]
 
     def extract_scalar(self, param: Parameter) -> str:
         if param not in self._scalar_extractions:
@@ -280,14 +281,20 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
     def extract_reduction_ptr(self, param: Parameter) -> str:
         if param not in self._reduction_ptrs:
-            # TODO: implement
-            pass
+            ptr = param.reduction_pointer
+            buffer = self.extract_buffer(ptr, param.name, param.dtype)
+            code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
+
+            assert code is not None
+
+            self._array_assoc_var_extractions[param] = code
+
         return param.name
 
     def extract_array_assoc_var(self, param: Parameter) -> str:
         if param not in self._array_assoc_var_extractions:
             field = param.fields[0]
-            buffer = self.extract_field(field)
+            buffer = self.extract_buffer(field, field.name, field.dtype)
             code: str | None = None
 
             for prop in param.properties:
-- 
GitLab


From 3e595df6c79cc1a7a8c2ff4ab86825e81aadbf43 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 22 Jan 2025 17:35:50 +0100
Subject: [PATCH 072/180] Refine test_reduction.py to check for result
 correctness

---
 tests/kernelcreation/test_reduction.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index c41d250f4..b97343e72 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -5,6 +5,15 @@ import sympy as sp
 import pystencils as ps
 from pystencils.sympyextensions import reduced_assign
 
+INIT=2
+SIZE=15
+SOLUTION = {
+    "+": INIT * SIZE,
+    "-": INIT * -SIZE,
+    "*": INIT**SIZE,
+    "min": INIT,
+    "max": INIT
+}
 
 @pytest.mark.parametrize('dtype', ["float64"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
@@ -24,6 +33,7 @@ def test_reduction(dtype, op):
 
     ps.show_code(ast_reduction)
 
-    array = np.ones((10,), dtype=dtype)
-    kernel_reduction(x=array, w=0)
-    # TODO: check if "w = #points"
\ No newline at end of file
+    array = np.full((SIZE,), INIT, dtype=dtype)
+    reduction_array = np.zeros(1, dtype=dtype)
+    kernel_reduction(x=array, w=reduction_array)
+    assert np.allclose(reduction_array, SOLUTION[op])
\ No newline at end of file
-- 
GitLab


From b352a2e2e8c2d7f4eeb0861dacff5d703ae51869 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 23 Jan 2025 18:18:24 +0100
Subject: [PATCH 073/180] Fix lint for jit/cpu_extension_module.py

---
 src/pystencils/jit/cpu_extension_module.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index d8d90c924..6ec62c28d 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -13,7 +13,7 @@ from ..codegen import (
     Kernel,
     Parameter,
 )
-from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride, ReductionPointerVariable
+from ..codegen.properties import FieldBasePtr, FieldShape, FieldStride
 from ..types import (
     PsType,
     PsUnsignedIntegerType,
@@ -21,7 +21,6 @@ from ..types import (
     PsIeeeFloatType,
 )
 from ..types.quick import Fp, SInt, UInt
-from ..field import Field
 
 
 class PsKernelExtensioNModule:
-- 
GitLab


From 4c726aa6aa2df6312252cb848ace95e790d62331 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 24 Jan 2025 14:43:32 +0100
Subject: [PATCH 074/180] Prepare reduction test for GPU support

---
 tests/kernelcreation/test_reduction.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index b97343e72..b56a24a19 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 import sympy as sp
+import cupy as cp
 
 import pystencils as ps
 from pystencils.sympyextensions import reduced_assign
@@ -18,6 +19,9 @@ SOLUTION = {
 @pytest.mark.parametrize('dtype', ["float64"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(dtype, op):
+
+    gpu_avail = True
+
     x = ps.fields(f'x: {dtype}[1d]')
     w = sp.Symbol("w")
 
@@ -25,7 +29,7 @@ def test_reduction(dtype, op):
 
     reduction_assignment = reduced_assign(w, op, x.center())
 
-    config = ps.CreateKernelConfig(cpu_openmp=True)
+    config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True)
 
     ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype)
     #code_reduction = ps.get_code_str(ast_reduction)
@@ -35,5 +39,13 @@ def test_reduction(dtype, op):
 
     array = np.full((SIZE,), INIT, dtype=dtype)
     reduction_array = np.zeros(1, dtype=dtype)
-    kernel_reduction(x=array, w=reduction_array)
-    assert np.allclose(reduction_array, SOLUTION[op])
\ No newline at end of file
+
+    if gpu_avail:
+        array_gpu = cp.asarray(array)
+        reduction_array_gpu = cp.asarray(reduction_array)
+
+        kernel_reduction(x=array_gpu, w=reduction_array_gpu)
+        assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
+    else:
+        kernel_reduction(x=array, w=reduction_array)
+        assert np.allclose(reduction_array, SOLUTION[op])
\ No newline at end of file
-- 
GitLab


From f0d2fde6848f9cddcf0c3b38ca169f2e85abc093 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 24 Jan 2025 16:20:37 +0100
Subject: [PATCH 075/180] Encapsulate mapping of binop strings to actual
 operators and also use it to take the initial value behind the passed
 reduction pointer into account

---
 .../backend/kernelcreation/freeze.py          | 25 ++++-----------
 src/pystencils/codegen/driver.py              |  7 +++--
 .../sympyextensions/binop_mapping.py          | 31 +++++++++++++++++++
 tests/kernelcreation/test_reduction.py        | 29 ++++++++---------
 4 files changed, 56 insertions(+), 36 deletions(-)
 create mode 100644 src/pystencils/sympyextensions/binop_mapping.py

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 1e9984def..f5f207acf 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -14,6 +14,7 @@ from ...sympyextensions import (
     integer_functions,
     ConditionalFieldAccess,
 )
+from ...sympyextensions.binop_mapping import binop_str_to_expr
 from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReducedAssignment
@@ -173,19 +174,7 @@ class FreezeExpressions:
         assert isinstance(lhs, PsExpression)
         assert isinstance(rhs, PsExpression)
 
-        match expr.op:
-            case "+=":
-                op = add
-            case "-=":
-                op = sub
-            case "*=":
-                op = mul
-            case "/=":
-                op = truediv
-            case _:
-                raise FreezeError(f"Unsupported augmented assignment: {expr.op}.")
-
-        return PsAssignment(lhs, op(lhs.clone(), rhs))
+        return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs))
 
     def map_ReducedAssignment(self, expr: ReducedAssignment):
         lhs = self.visit(expr.lhs)
@@ -204,27 +193,25 @@ class FreezeExpressions:
         new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype)
         new_lhs = PsSymbolExpr(new_lhs_symb)
 
-        # match for reduction operation and set neutral init_val and new rhs (similar to augmented assignment)
+        # get new rhs from augmented assignment
+        new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs)
+
+        # match for reduction operation and set neutral init_val
         new_rhs: PsExpression
         init_val: PsExpression
         match expr.op:
             case "+":
                 init_val = PsConstantExpr(PsConstant(0, dtype))
-                new_rhs = add(new_lhs.clone(), rhs)
             case "-":
                 init_val = PsConstantExpr(PsConstant(0, dtype))
-                new_rhs = sub(new_lhs.clone(), rhs)
             case "*":
                 init_val = PsConstantExpr(PsConstant(1, dtype))
-                new_rhs = mul(new_lhs.clone(), rhs)
             case "min":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
                 init_val.dtype = dtype
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Min), [new_lhs.clone(), rhs])
             case "max":
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
                 init_val.dtype = dtype
-                new_rhs = PsCall(PsMathFunction(MathFunctions.Max), [new_lhs.clone(), rhs])
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index f414b953e..8b8ecd15b 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,7 +7,8 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
-from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
+from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr, PsExpression
+from ..sympyextensions.binop_mapping import binop_str_to_expr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
@@ -159,9 +160,9 @@ class DefaultKernelCreationDriver:
 
         #   Write back result to reduction target variable
         for red_ptr, prop in self._ctx.reduction_pointer_symbols.items():
+            ptr_access = PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
             kernel_ast.statements += [PsAssignment(
-                PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype))),
-                PsSymbolExpr(prop.local_symbol))]
+                ptr_access, binop_str_to_expr(prop.op, ptr_access, PsSymbolExpr(prop.local_symbol)))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/sympyextensions/binop_mapping.py
new file mode 100644
index 000000000..1cb2a3ab5
--- /dev/null
+++ b/src/pystencils/sympyextensions/binop_mapping.py
@@ -0,0 +1,31 @@
+from operator import truediv, mul, sub, add
+
+from src.pystencils.backend.ast.expressions import PsCall, PsExpression
+from src.pystencils.backend.exceptions import FreezeError
+from src.pystencils.backend.functions import MathFunctions, PsMathFunction
+
+_available_operator_interface: set[str] = {'+', '-', '*', '/'}
+
+
+def binop_str_to_expr(op: str, op1, op2) -> PsExpression:
+    if op in _available_operator_interface:
+        match op:
+            case "+":
+                operator = add
+            case "-":
+                operator = sub
+            case "*":
+                operator = mul
+            case "/":
+                operator = truediv
+            case _:
+                raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
+        return operator(op1, op2)
+    else:
+        match op:
+            case "min":
+                return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
+            case "max":
+                return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
+            case _:
+                raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index b56a24a19..c01dce5a6 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -6,21 +6,22 @@ import cupy as cp
 import pystencils as ps
 from pystencils.sympyextensions import reduced_assign
 
-INIT=2
-SIZE=15
+INIT_W = 5
+INIT_ARR = 2
+SIZE = 15
 SOLUTION = {
-    "+": INIT * SIZE,
-    "-": INIT * -SIZE,
-    "*": INIT**SIZE,
-    "min": INIT,
-    "max": INIT
+    "+": INIT_W + INIT_ARR * SIZE,
+    "-": INIT_W - INIT_ARR * -SIZE,
+    "*": INIT_W * INIT_ARR ** SIZE,
+    "min": min(INIT_W, INIT_ARR),
+    "max": max(INIT_W, INIT_ARR),
 }
 
+
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+@pytest.mark.parametrize("op", ["+", "-", "*"]) #, "min", "max"]) # TODO: min/max broken due to error in BasePrinter
 def test_reduction(dtype, op):
-
-    gpu_avail = True
+    gpu_avail = False
 
     x = ps.fields(f'x: {dtype}[1d]')
     w = sp.Symbol("w")
@@ -32,13 +33,13 @@ def test_reduction(dtype, op):
     config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True)
 
     ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype)
-    #code_reduction = ps.get_code_str(ast_reduction)
+    # code_reduction = ps.get_code_str(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
     ps.show_code(ast_reduction)
 
-    array = np.full((SIZE,), INIT, dtype=dtype)
-    reduction_array = np.zeros(1, dtype=dtype)
+    array = np.full((SIZE,), INIT_ARR, dtype=dtype)
+    reduction_array = np.full((1,), INIT_W, dtype=dtype)
 
     if gpu_avail:
         array_gpu = cp.asarray(array)
@@ -48,4 +49,4 @@ def test_reduction(dtype, op):
         assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
     else:
         kernel_reduction(x=array, w=reduction_array)
-        assert np.allclose(reduction_array, SOLUTION[op])
\ No newline at end of file
+        assert np.allclose(reduction_array, SOLUTION[op])
-- 
GitLab


From 96b5cbf286a29882b496faee3b9fe3be481d8bb3 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 12:44:21 +0100
Subject: [PATCH 076/180] Fix lint

---
 src/pystencils/backend/kernelcreation/freeze.py | 5 ++---
 src/pystencils/codegen/driver.py                | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index f5f207acf..1238f16af 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -1,9 +1,8 @@
 from typing import overload, cast, Any
 from functools import reduce
-from operator import add, mul, sub, truediv
+from operator import add, mul, sub
 
 import sympy as sp
-import sympy.core.relational
 import sympy.logic.boolalg
 from sympy.codegen.ast import AssignmentBase, AugmentedAssignment
 
@@ -184,7 +183,7 @@ class FreezeExpressions:
         assert isinstance(lhs, PsSymbolExpr)
 
         orig_lhs_symb = lhs.symbol
-        dtype = rhs.dtype # TODO: kernel with (implicit) up/downcasts?
+        dtype = rhs.dtype  # TODO: kernel with (implicit) up/downcasts?
 
         # replace original symbol with pointer-based type used for export
         orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 8b8ecd15b..b47ad8a9e 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
-from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr, PsExpression
+from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 from ..sympyextensions.binop_mapping import binop_str_to_expr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
-- 
GitLab


From 3daaa5e5a1f92a5482cb838cd791ed062dae1398 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 12:47:15 +0100
Subject: [PATCH 077/180] Fix typecheck

---
 src/pystencils/sympyextensions/__init__.py      | 2 ++
 src/pystencils/sympyextensions/binop_mapping.py | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 6ab24e936..8d832ba2a 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -2,6 +2,7 @@ from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
 from .reduction import reduced_assign
+from .binop_mapping import binop_str_to_expr
 
 from .math import (
     prod,
@@ -35,6 +36,7 @@ from .math import (
 __all__ = [
     "ConditionalFieldAccess",
     "reduced_assign",
+    "binop_str_to_expr",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/sympyextensions/binop_mapping.py
index 1cb2a3ab5..04cfb6107 100644
--- a/src/pystencils/sympyextensions/binop_mapping.py
+++ b/src/pystencils/sympyextensions/binop_mapping.py
@@ -1,8 +1,8 @@
 from operator import truediv, mul, sub, add
 
-from src.pystencils.backend.ast.expressions import PsCall, PsExpression
-from src.pystencils.backend.exceptions import FreezeError
-from src.pystencils.backend.functions import MathFunctions, PsMathFunction
+from ..backend.ast.expressions import PsCall, PsExpression
+from ..backend.exceptions import FreezeError
+from ..backend.functions import MathFunctions, PsMathFunction
 
 _available_operator_interface: set[str] = {'+', '-', '*', '/'}
 
-- 
GitLab


From c73deaf6d54da2b95310dc1e606ed132b465c874 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 13:11:14 +0100
Subject: [PATCH 078/180] Fix mypy errors and move binop mapping function

---
 src/pystencils/__init__.py                            |  2 ++
 src/pystencils/backend/kernelcreation/freeze.py       |  5 +++--
 src/pystencils/{sympyextensions => }/binop_mapping.py |  6 +++---
 src/pystencils/codegen/driver.py                      | 10 +++++-----
 src/pystencils/sympyextensions/__init__.py            |  2 --
 5 files changed, 13 insertions(+), 12 deletions(-)
 rename src/pystencils/{sympyextensions => }/binop_mapping.py (85%)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 916a61392..3e8e8d8e4 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -45,6 +45,7 @@ from .sympyextensions.reduction import (
     MinReducedssignment,
     MaxReducedssignment
 )
+from .binop_mapping import binop_str_to_expr
 
 __all__ = [
     "Field",
@@ -75,6 +76,7 @@ __all__ = [
     "inspect",
     "AssignmentCollection",
     "Assignment",
+    "binop_str_to_expr",
     "AddAugmentedAssignment",
     "AddReducedAssignment",
     "SubReducedAssignment",
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 1238f16af..68868e143 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -13,7 +13,7 @@ from ...sympyextensions import (
     integer_functions,
     ConditionalFieldAccess,
 )
-from ...sympyextensions.binop_mapping import binop_str_to_expr
+from ...binop_mapping import binop_str_to_expr
 from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReducedAssignment
@@ -185,6 +185,8 @@ class FreezeExpressions:
         orig_lhs_symb = lhs.symbol
         dtype = rhs.dtype  # TODO: kernel with (implicit) up/downcasts?
 
+        assert isinstance(dtype, PsNumericType)
+
         # replace original symbol with pointer-based type used for export
         orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
 
@@ -196,7 +198,6 @@ class FreezeExpressions:
         new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs)
 
         # match for reduction operation and set neutral init_val
-        new_rhs: PsExpression
         init_val: PsExpression
         match expr.op:
             case "+":
diff --git a/src/pystencils/sympyextensions/binop_mapping.py b/src/pystencils/binop_mapping.py
similarity index 85%
rename from src/pystencils/sympyextensions/binop_mapping.py
rename to src/pystencils/binop_mapping.py
index 04cfb6107..060fa40aa 100644
--- a/src/pystencils/sympyextensions/binop_mapping.py
+++ b/src/pystencils/binop_mapping.py
@@ -1,8 +1,8 @@
 from operator import truediv, mul, sub, add
 
-from ..backend.ast.expressions import PsCall, PsExpression
-from ..backend.exceptions import FreezeError
-from ..backend.functions import MathFunctions, PsMathFunction
+from .backend.ast.expressions import PsExpression, PsCall
+from .backend.exceptions import FreezeError
+from .backend.functions import PsMathFunction, MathFunctions
 
 _available_operator_interface: set[str] = {'+', '-', '*', '/'}
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index b47ad8a9e..d68bfbcac 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,8 +7,8 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
+from ..binop_mapping import binop_str_to_expr
 from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
-from ..sympyextensions.binop_mapping import binop_str_to_expr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
@@ -155,14 +155,14 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Init local reduction variable copy
-        for local_red, prop in self._ctx.local_reduction_symbols.items():
-            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), prop.init_val)] + kernel_ast.statements
+        for local_red, local_prop in self._ctx.local_reduction_symbols.items():
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), local_prop.init_val)] + kernel_ast.statements
 
         #   Write back result to reduction target variable
-        for red_ptr, prop in self._ctx.reduction_pointer_symbols.items():
+        for red_ptr, ptr_prop in self._ctx.reduction_pointer_symbols.items():
             ptr_access = PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
             kernel_ast.statements += [PsAssignment(
-                ptr_access, binop_str_to_expr(prop.op, ptr_access, PsSymbolExpr(prop.local_symbol)))]
+                ptr_access, binop_str_to_expr(ptr_prop.op, ptr_access, PsSymbolExpr(ptr_prop.local_symbol)))]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 8d832ba2a..6ab24e936 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -2,7 +2,6 @@ from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
 from .reduction import reduced_assign
-from .binop_mapping import binop_str_to_expr
 
 from .math import (
     prod,
@@ -36,7 +35,6 @@ from .math import (
 __all__ = [
     "ConditionalFieldAccess",
     "reduced_assign",
-    "binop_str_to_expr",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
-- 
GitLab


From f71ce708a1aaa876a700e880d8cd1b63a0d080ee Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 18:00:53 +0100
Subject: [PATCH 079/180] Enforce usage of typed symbols for reductions

---
 src/pystencils/backend/kernelcreation/freeze.py | 4 +++-
 tests/kernelcreation/test_reduction.py          | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 68868e143..de272cf44 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -176,6 +176,8 @@ class FreezeExpressions:
         return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs))
 
     def map_ReducedAssignment(self, expr: ReducedAssignment):
+        assert isinstance(expr.lhs, TypedSymbol)
+
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
 
@@ -183,7 +185,7 @@ class FreezeExpressions:
         assert isinstance(lhs, PsSymbolExpr)
 
         orig_lhs_symb = lhs.symbol
-        dtype = rhs.dtype  # TODO: kernel with (implicit) up/downcasts?
+        dtype = lhs.dtype
 
         assert isinstance(dtype, PsNumericType)
 
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index c01dce5a6..8095f4e1d 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -24,7 +24,7 @@ def test_reduction(dtype, op):
     gpu_avail = False
 
     x = ps.fields(f'x: {dtype}[1d]')
-    w = sp.Symbol("w")
+    w = ps.TypedSymbol("w", dtype)
 
     # kernel with reduction assignment
 
-- 
GitLab


From e94c4980ed860b127a27743fd9727192d667c906 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 19:18:00 +0100
Subject: [PATCH 080/180] Adapt reduction assignment interface and use enums
 instead of strings for the binary operation

---
 src/pystencils/__init__.py                    | 22 +++----
 .../backend/kernelcreation/freeze.py          | 27 +++++---
 .../backend/transformations/add_pragmas.py    |  2 +-
 src/pystencils/codegen/driver.py              |  2 +-
 src/pystencils/codegen/properties.py          |  5 +-
 ...inop_mapping.py => compound_op_mapping.py} | 18 +++---
 src/pystencils/sympyextensions/__init__.py    |  6 +-
 src/pystencils/sympyextensions/reduction.py   | 63 ++++++++++++-------
 tests/kernelcreation/test_reduction.py        |  7 +--
 9 files changed, 90 insertions(+), 62 deletions(-)
 rename src/pystencils/{binop_mapping.py => compound_op_mapping.py} (65%)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 3e8e8d8e4..6aa305a16 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -39,13 +39,12 @@ from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator
 from .datahandling import create_data_handling
 from .sympyextensions.reduction import (
-    AddReducedAssignment,
-    SubReducedAssignment,
-    MulReducedAssignment,
-    MinReducedssignment,
-    MaxReducedssignment
+    AddReductionAssignment,
+    SubReductionAssignment,
+    MulReductionAssignment,
+    MinReductionAssignment,
+    MaxReductionAssignment,
 )
-from .binop_mapping import binop_str_to_expr
 
 __all__ = [
     "Field",
@@ -76,13 +75,12 @@ __all__ = [
     "inspect",
     "AssignmentCollection",
     "Assignment",
-    "binop_str_to_expr",
     "AddAugmentedAssignment",
-    "AddReducedAssignment",
-    "SubReducedAssignment",
-    "MulReducedAssignment",
-    "MinReducedssignment",
-    "MaxReducedssignment",
+    "AddReductionAssignment",
+    "SubReductionAssignment",
+    "MulReductionAssignment",
+    "MinReductionAssignment",
+    "MaxReductionAssignment",
     "assignment_from_stencil",
     "SymbolCreator",
     "create_data_handling",
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index de272cf44..4bf136562 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -13,10 +13,10 @@ from ...sympyextensions import (
     integer_functions,
     ConditionalFieldAccess,
 )
-from ...binop_mapping import binop_str_to_expr
+from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions.typed_sympy import TypedSymbol, CastFunc, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
-from ...sympyextensions.reduction import ReducedAssignment
+from ...sympyextensions.reduction import ReductionAssignment, ReductionOp
 from ...field import Field, FieldType
 
 from .context import KernelCreationContext
@@ -173,9 +173,16 @@ class FreezeExpressions:
         assert isinstance(lhs, PsExpression)
         assert isinstance(rhs, PsExpression)
 
-        return PsAssignment(lhs, binop_str_to_expr(expr.op[0], lhs.clone(), rhs))
+        _str_to_compound_op: dict[str, ReductionOp] = {
+            "+=": ReductionOp.Add,
+            "-=": ReductionOp.Sub,
+            "*=": ReductionOp.Mul,
+            "/=": ReductionOp.Div,
+        }
 
-    def map_ReducedAssignment(self, expr: ReducedAssignment):
+        return PsAssignment(lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs))
+
+    def map_ReductionAssignment(self, expr: ReductionAssignment):
         assert isinstance(expr.lhs, TypedSymbol)
 
         lhs = self.visit(expr.lhs)
@@ -197,21 +204,21 @@ class FreezeExpressions:
         new_lhs = PsSymbolExpr(new_lhs_symb)
 
         # get new rhs from augmented assignment
-        new_rhs: PsExpression = binop_str_to_expr(expr.op, new_lhs.clone(), rhs)
+        new_rhs: PsExpression = compound_op_to_expr(expr.op, new_lhs.clone(), rhs)
 
         # match for reduction operation and set neutral init_val
         init_val: PsExpression
         match expr.op:
-            case "+":
+            case ReductionOp.Add:
                 init_val = PsConstantExpr(PsConstant(0, dtype))
-            case "-":
+            case ReductionOp.Sub:
                 init_val = PsConstantExpr(PsConstant(0, dtype))
-            case "*":
+            case ReductionOp.Mul:
                 init_val = PsConstantExpr(PsConstant(1, dtype))
-            case "min":
+            case ReductionOp.Min:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
                 init_val.dtype = dtype
-            case "max":
+            case ReductionOp.Max:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
                 init_val.dtype = dtype
             case _:
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index 44d1d1ede..f4046d87d 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -115,7 +115,7 @@ class AddOpenMP:
         if bool(ctx.local_reduction_symbols):
             for symbol, reduction in ctx.local_reduction_symbols.items():
                 if isinstance(symbol.dtype, PsScalarType):
-                    pragma_text += f" reduction({reduction.op}: {symbol.name})"
+                    pragma_text += f" reduction({reduction.op.value}: {symbol.name})"
                 else:
                     NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.")
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index d68bfbcac..6e0611a4b 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,7 +7,7 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
 from .parameters import Parameter
-from ..binop_mapping import binop_str_to_expr
+from ..compound_op_mapping import compound_op_to_expr
 from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index 1e71c5b98..d3c2435ed 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
+from ..sympyextensions.reduction import ReductionOp
 
 
 @dataclass(frozen=True)
@@ -21,7 +22,7 @@ class LocalReductionVariable(PsSymbolProperty):
     from ..backend.memory import PsSymbol
     from ..backend.ast.expressions import PsExpression
 
-    op: str
+    op: ReductionOp
     init_val: PsExpression
     ptr_symbol: PsSymbol
 
@@ -32,7 +33,7 @@ class ReductionPointerVariable(PsSymbolProperty):
 
     from ..backend.memory import PsSymbol
 
-    op: str
+    op: ReductionOp
     local_symbol: PsSymbol
 
 
diff --git a/src/pystencils/binop_mapping.py b/src/pystencils/compound_op_mapping.py
similarity index 65%
rename from src/pystencils/binop_mapping.py
rename to src/pystencils/compound_op_mapping.py
index 060fa40aa..eb10b3381 100644
--- a/src/pystencils/binop_mapping.py
+++ b/src/pystencils/compound_op_mapping.py
@@ -1,31 +1,33 @@
+from enum import Enum
 from operator import truediv, mul, sub, add
 
 from .backend.ast.expressions import PsExpression, PsCall
 from .backend.exceptions import FreezeError
 from .backend.functions import PsMathFunction, MathFunctions
+from .sympyextensions.reduction import ReductionOp
 
-_available_operator_interface: set[str] = {'+', '-', '*', '/'}
+_available_operator_interface: set[ReductionOp] = {ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Div}
 
 
-def binop_str_to_expr(op: str, op1, op2) -> PsExpression:
+def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     if op in _available_operator_interface:
         match op:
-            case "+":
+            case ReductionOp.Add:
                 operator = add
-            case "-":
+            case ReductionOp.Sub:
                 operator = sub
-            case "*":
+            case ReductionOp.Mul:
                 operator = mul
-            case "/":
+            case ReductionOp.Div:
                 operator = truediv
             case _:
                 raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
         return operator(op1, op2)
     else:
         match op:
-            case "min":
+            case ReductionOp.Min:
                 return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
-            case "max":
+            case ReductionOp.Max:
                 return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
             case _:
                 raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 6ab24e936..eb90f4bed 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,7 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc
 from .pointers import mem_acc
-from .reduction import reduced_assign
+from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp
 
 from .math import (
     prod,
@@ -34,7 +34,9 @@ from .math import (
 
 __all__ = [
     "ConditionalFieldAccess",
-    "reduced_assign",
+    "reduction_assignment",
+    "reduction_assignment_from_str",
+    "ReductionOp",
     "TypedSymbol",
     "CastFunc",
     "mem_acc",
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index c9e5bfdfb..9d8aecb5b 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -1,54 +1,73 @@
+from enum import Enum
+
 from sympy.codegen.ast import AssignmentBase
 
 
-class ReducedAssignment(AssignmentBase):
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Div = "/"
+    Min = "min"
+    Max = "max"
+
+
+class ReductionAssignment(AssignmentBase):
     """
     Base class for reduced assignments.
 
     Attributes:
     ===========
 
-    binop : str
-       Symbol for binary operation being applied in the assignment, such as "+",
-       "*", etc.
+    binop : ReductionOp
+       Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
     """
-    binop = None  # type: str
+    binop = None  # type: ReductionOp
 
     @property
     def op(self):
         return self.binop
 
 
-class AddReducedAssignment(ReducedAssignment):
-    binop = '+'
+class AddReductionAssignment(ReductionAssignment):
+    binop = ReductionOp.Add
 
 
-class SubReducedAssignment(ReducedAssignment):
-    binop = '-'
+class SubReductionAssignment(ReductionAssignment):
+    binop = ReductionOp.Sub
 
 
-class MulReducedAssignment(ReducedAssignment):
-    binop = '*'
+class MulReductionAssignment(ReductionAssignment):
+    binop = ReductionOp.Mul
 
 
-class MinReducedssignment(ReducedAssignment):
-    binop = 'min'
+class MinReductionAssignment(ReductionAssignment):
+    binop = ReductionOp.Min
 
 
-class MaxReducedssignment(ReducedAssignment):
-    binop = 'max'
+class MaxReductionAssignment(ReductionAssignment):
+    binop = ReductionOp.Max
 
 
-# Mapping from binary op strings to AugmentedAssignment subclasses
-reduced_assign_classes = {
+# Mapping from ReductionOp enum members to ReductionAssignment classes
+_reduction_assignment_classes = {
     cls.binop: cls for cls in [
-        AddReducedAssignment, SubReducedAssignment, MulReducedAssignment,
-        MinReducedssignment, MaxReducedssignment
+        AddReductionAssignment, SubReductionAssignment, MulReductionAssignment,
+        MinReductionAssignment, MaxReductionAssignment
     ]
 }
 
+# Mapping from operator strings (ReductionOp values, e.g. "+") to the corresponding ReductionOp members
+_reduction_assignment_classes_for_str = {
+    cls.value: cls for cls in _reduction_assignment_classes
+}
 
-def reduced_assign(lhs, op, rhs):
-    if op not in reduced_assign_classes:
+
+def reduction_assignment(lhs, op: ReductionOp, rhs):
+    if op not in _reduction_assignment_classes:
         raise ValueError("Unrecognized operator %s" % op)
-    return reduced_assign_classes[op](lhs, rhs)
+    return _reduction_assignment_classes[op](lhs, rhs)
+
+
+def reduction_assignment_from_str(lhs, op: str, rhs):
+    return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs)
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 8095f4e1d..c84417ac7 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -1,10 +1,9 @@
 import pytest
 import numpy as np
-import sympy as sp
 import cupy as cp
 
 import pystencils as ps
-from pystencils.sympyextensions import reduced_assign
+from pystencils.sympyextensions import reduction_assignment_from_str
 
 INIT_W = 5
 INIT_ARR = 2
@@ -28,11 +27,11 @@ def test_reduction(dtype, op):
 
     # kernel with reduction assignment
 
-    reduction_assignment = reduced_assign(w, op, x.center())
+    red_assign = reduction_assignment_from_str(w, op, x.center())
 
     config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True)
 
-    ast_reduction = ps.create_kernel([reduction_assignment], config, default_dtype=dtype)
+    ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype)
     # code_reduction = ps.get_code_str(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
-- 
GitLab


From 2424c15725cd621fd8bfa573f928b82255a17693 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 28 Jan 2025 20:05:51 +0100
Subject: [PATCH 081/180] Move reduction properties for local and ptr variable
 into single dataclass

---
 .../backend/kernelcreation/context.py         | 63 +++++++------------
 .../backend/kernelcreation/freeze.py          | 24 ++++---
 .../backend/transformations/add_pragmas.py    |  6 +-
 src/pystencils/codegen/driver.py              | 24 ++++---
 src/pystencils/codegen/parameters.py          | 14 +----
 src/pystencils/codegen/properties.py          | 23 -------
 src/pystencils/jit/cpu_extension_module.py    | 16 ++---
 tests/kernelcreation/test_reduction.py        |  2 +-
 8 files changed, 60 insertions(+), 112 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 2f46a7421..868a7852c 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -1,16 +1,17 @@
 from __future__ import annotations
 
+from dataclasses import dataclass
 from typing import Iterable, Iterator, Any
 from itertools import chain, count
 from collections import namedtuple, defaultdict
 import re
 
+from ..ast.expressions import PsExpression
 from ...defaults import DEFAULTS
 from ...field import Field, FieldType
+from ...sympyextensions import ReductionOp
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
-from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
-
 from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
 from ...types import (
@@ -46,6 +47,16 @@ class FieldsInKernel:
 FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 
 
+@dataclass(frozen=True)
+class ReductionInfo:
+
+    op: ReductionOp
+    init_val: PsExpression
+
+    orig_symbol: PsSymbol
+    ptr_symbol: PsSymbol
+
+
 class KernelCreationContext:
     """Manages the translation process from the SymPy frontend to the backend AST, and collects
     all necessary information for the translation:
@@ -77,8 +88,7 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        self._local_reduction_symbols: dict[PsSymbol, LocalReductionVariable] = dict()
-        self._reduction_ptr_symbols: dict[PsSymbol, ReductionPointerVariable] = dict()
+        self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -173,41 +183,17 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
-    def add_local_reduction_symbol(self, local_symb: PsSymbol, local_var_prop: LocalReductionVariable):
-        """Adds entry for a symbol and its property to the lookup table for local reduction variables.
+    def add_symbol_reduction_info(self, local_symb: PsSymbol, reduction_info: ReductionInfo):
+        """Adds entry for a symbol and its reduction info to its corresponding lookup table.
 
-        The symbol ``symbol`` should not have a 'LocalReductionSymbol' property and shall not exist in the symbol table.
+        The symbol ``local_symb`` shall not already have an entry in the reduction info table.
         """
-        if self.find_symbol(local_symb.name) is not None:
-            raise PsInternalCompilerError(
-                f"add_local_reduction_symbol: {local_symb.name} already exist in the symbol table"
-            )
-        self.add_symbol(local_symb)
-
-        if local_symb not in self._local_reduction_symbols and not local_symb.get_properties(LocalReductionVariable):
-            local_symb.add_property(local_var_prop)
-            self._local_reduction_symbols[local_symb] = local_var_prop
-        else:
+        if local_symb in self._symbols_reduction_info:
             raise PsInternalCompilerError(
-                f"add_local_reduction_symbol: {local_symb.name} already exists in local reduction table"
+                f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table"
             )
 
-    def add_reduction_ptr_symbol(self, orig_symb: PsSymbol, ptr_symb: PsSymbol, ptr_var_prop: ReductionPointerVariable):
-        """Replaces reduction symbol with a pointer-based counterpart used for export
-        and adds the new symbol and its property to the lookup table for pointer-based reduction variables
-
-        The symbol ``ptr_symbol`` should not exist in the symbol table.
-        """
-        self.replace_symbol(orig_symb, ptr_symb)
-
-        if ptr_symb not in self._reduction_ptr_symbols and not ptr_symb.get_properties(
-                ReductionPointerVariable):
-            ptr_symb.add_property(ptr_var_prop)
-            self._reduction_ptr_symbols[ptr_symb] = ptr_var_prop
-        else:
-            raise PsInternalCompilerError(
-                f"add_reduction_ptr_symbol: {ptr_symb.name} already exists in pointer-based reduction variable table "
-            )
+        self._symbols_reduction_info[local_symb] = reduction_info
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
@@ -245,14 +231,9 @@ class KernelCreationContext:
         return self._symbols.values()
 
     @property
-    def local_reduction_symbols(self) -> dict[PsSymbol, LocalReductionVariable]:
+    def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]:
         """Return a dictionary holding kernel-local reduction symbols and their reduction properties."""
-        return self._local_reduction_symbols
-
-    @property
-    def reduction_pointer_symbols(self) -> dict[PsSymbol, ReductionPointerVariable]:
-        """Return a dictionary holding pointer-based reduction symbols and their reduction properties."""
-        return self._reduction_ptr_symbols
+        return self._symbols_reduction_info
 
     #   Fields and Arrays
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 4bf136562..5bb7f8b08 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -19,7 +19,7 @@ from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReductionAssignment, ReductionOp
 from ...field import Field, FieldType
 
-from .context import KernelCreationContext
+from .context import KernelCreationContext, ReductionInfo
 
 from ..ast.structural import (
     PsAstNode,
@@ -66,8 +66,6 @@ from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
 
-from ...codegen.properties import LocalReductionVariable, ReductionPointerVariable
-
 
 ExprLike = (
     sp.Expr
@@ -210,25 +208,25 @@ class FreezeExpressions:
         init_val: PsExpression
         match expr.op:
             case ReductionOp.Add:
-                init_val = PsConstantExpr(PsConstant(0, dtype))
+                init_val = PsConstantExpr(PsConstant(0))
             case ReductionOp.Sub:
-                init_val = PsConstantExpr(PsConstant(0, dtype))
+                init_val = PsConstantExpr(PsConstant(0))
             case ReductionOp.Mul:
-                init_val = PsConstantExpr(PsConstant(1, dtype))
+                init_val = PsConstantExpr(PsConstant(1))
             case ReductionOp.Min:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
-                init_val.dtype = dtype
             case ReductionOp.Max:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
-                init_val.dtype = dtype
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        # set reduction symbol properties (local/pointer variables) in context
-        self._ctx.add_local_reduction_symbol(new_lhs_symb,
-                                             LocalReductionVariable(expr.op, init_val, orig_lhs_symb_as_ptr))
-        self._ctx.add_reduction_ptr_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr,
-                                           ReductionPointerVariable(expr.op, new_lhs_symb))
+        reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb, orig_lhs_symb_as_ptr)
+
+        # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info
+        self._ctx.add_symbol(new_lhs_symb)
+        self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info)
+        self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr)
+
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index f4046d87d..d72008d56 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -112,10 +112,10 @@ class AddOpenMP:
         pragma_text += " parallel" if not omp_params.omit_parallel_construct else ""
         pragma_text += f" for schedule({omp_params.schedule})"
 
-        if bool(ctx.local_reduction_symbols):
-            for symbol, reduction in ctx.local_reduction_symbols.items():
+        if bool(ctx.symbols_reduction_info):
+            for symbol, reduction_info in ctx.symbols_reduction_info.items():
                 if isinstance(symbol.dtype, PsScalarType):
-                    pragma_text += f" reduction({reduction.op.value}: {symbol.name})"
+                    pragma_text += f" reduction({reduction_info.op.value}: {symbol.name})"
                 else:
                     NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.")
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 6e0611a4b..ba7df317a 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass, replace
 from .target import Target
 from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
-from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr, ReductionPointerVariable
+from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
 from ..compound_op_mapping import compound_op_to_expr
 from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
@@ -154,15 +154,21 @@ class DefaultKernelCreationDriver:
         if self._intermediates is not None:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
-        #   Init local reduction variable copy
-        for local_red, local_prop in self._ctx.local_reduction_symbols.items():
-            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(local_red), local_prop.init_val)] + kernel_ast.statements
+        #   Extensions for reductions
+        for symbol, reduction_info in self._ctx.symbols_reduction_info.items():
+            # Init local reduction variable copy
+            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(symbol),
+                                                   reduction_info.init_val)] + kernel_ast.statements
 
-        #   Write back result to reduction target variable
-        for red_ptr, ptr_prop in self._ctx.reduction_pointer_symbols.items():
-            ptr_access = PsMemAcc(PsSymbolExpr(red_ptr), PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
+            # Write back result to reduction target variable
+            ptr_access = PsMemAcc(PsSymbolExpr(reduction_info.ptr_symbol),
+                                  PsConstantExpr(PsConstant(0)))
             kernel_ast.statements += [PsAssignment(
-                ptr_access, binop_str_to_expr(ptr_prop.op, ptr_access, PsSymbolExpr(ptr_prop.local_symbol)))]
+                ptr_access, compound_op_to_expr(reduction_info.op, ptr_access, PsSymbolExpr(symbol)))]
+
+            # TODO: only newly introduced nodes
+            typify = Typifier(self._ctx)
+            kernel_ast = typify(kernel_ast)
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
@@ -462,8 +468,6 @@ def _get_function_params(
         props: set[PsSymbolProperty] = set()
         for prop in symb.properties:
             match prop:
-                case ReductionPointerVariable():
-                    props.add(prop)
                 case FieldShape() | FieldStride():
                     props.add(prop)
                 case BufferBasePtr(buf):
diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index 094553517..e6a513cc7 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -8,7 +8,7 @@ from .properties import (
     _FieldProperty,
     FieldShape,
     FieldStride,
-    FieldBasePtr, ReductionPointerVariable,
+    FieldBasePtr,
 )
 from ..types import PsType
 from ..field import Field
@@ -39,9 +39,6 @@ class Parameter:
                 key=lambda f: f.name,
             )
         )
-        self._reduction_ptr: Optional[ReductionPointerVariable] = next(
-            (e for e in self._properties if isinstance(e, ReductionPointerVariable)), None
-        )
 
     @property
     def name(self):
@@ -82,11 +79,6 @@ class Parameter:
         """Set of fields associated with this parameter."""
         return self._fields
 
-    @property
-    def reduction_pointer(self) -> Optional[ReductionPointerVariable]:
-        """Reduction pointer associated with this parameter."""
-        return self._reduction_ptr
-
     def get_properties(
         self, prop_type: type[PsSymbolProperty] | tuple[type[PsSymbolProperty], ...]
     ) -> set[PsSymbolProperty]:
@@ -113,10 +105,6 @@ class Parameter:
         )
         return bool(self.get_properties(FieldBasePtr))
 
-    @property
-    def is_reduction_pointer(self) -> bool:
-        return bool(self._reduction_ptr)
-
     @property
     def is_field_stride(self) -> bool:  # pragma: no cover
         warn(
diff --git a/src/pystencils/codegen/properties.py b/src/pystencils/codegen/properties.py
index d3c2435ed..d377fb3d3 100644
--- a/src/pystencils/codegen/properties.py
+++ b/src/pystencils/codegen/properties.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 from dataclasses import dataclass
 
 from ..field import Field
-from ..sympyextensions.reduction import ReductionOp
 
 
 @dataclass(frozen=True)
@@ -15,28 +14,6 @@ class UniqueSymbolProperty(PsSymbolProperty):
     """Base class for unique properties, of which only one instance may be registered at a time."""
 
 
-@dataclass(frozen=True)
-class LocalReductionVariable(PsSymbolProperty):
-    """Property for symbols specifying the operation and initial value for a kernel-local reduction variable."""
-
-    from ..backend.memory import PsSymbol
-    from ..backend.ast.expressions import PsExpression
-
-    op: ReductionOp
-    init_val: PsExpression
-    ptr_symbol: PsSymbol
-
-
-@dataclass(frozen=True)
-class ReductionPointerVariable(PsSymbolProperty):
-    """Property for pointer-type symbols exporting the reduction result from the kernel."""
-
-    from ..backend.memory import PsSymbol
-
-    op: ReductionOp
-    local_symbol: PsSymbol
-
-
 @dataclass(frozen=True)
 class FieldShape(PsSymbolProperty):
     """Symbol acts as a shape parameter to a field."""
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index 6ec62c28d..44185f4ed 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -18,7 +18,7 @@ from ..types import (
     PsType,
     PsUnsignedIntegerType,
     PsSignedIntegerType,
-    PsIeeeFloatType,
+    PsIeeeFloatType, PsPointerType,
 )
 from ..types.quick import Fp, SInt, UInt
 
@@ -205,7 +205,7 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()
 
-        self._reduction_ptrs: dict[Parameter, str] = dict()
+        self._pointer_extractions: dict[Parameter, str] = dict()
 
         self._constraint_checks: list[str] = []
 
@@ -278,9 +278,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 
         return param.name
 
-    def extract_reduction_ptr(self, param: Parameter) -> str:
-        if param not in self._reduction_ptrs:
-            ptr = param.reduction_pointer
+    def extract_ptr(self, param: Parameter) -> str:
+        if param not in self._pointer_extractions:
+            ptr = param.symbol
             buffer = self.extract_buffer(ptr, param.name, param.dtype)
             code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
 
@@ -317,10 +317,10 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         return param.name
 
     def extract_parameter(self, param: Parameter):
-        if param.is_reduction_pointer:
-            self.extract_reduction_ptr(param)
-        elif param.is_field_parameter:
+        if param.is_field_parameter:
             self.extract_array_assoc_var(param)
+        elif isinstance(param.dtype, PsPointerType):
+            self.extract_ptr(param)
         else:
             self.extract_scalar(param)
 
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index c84417ac7..69b75e711 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,7 +18,7 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+", "-", "*"]) #, "min", "max"]) # TODO: min/max broken due to error in BasePrinter
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(dtype, op):
     gpu_avail = False
 
-- 
GitLab


From 06dc234497d8860c4c7fde704ba26c8c3da030a1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 29 Jan 2025 11:48:26 +0100
Subject: [PATCH 082/180] Use std::numeric_limits as NumericLimitsFunctions
 backend for cpu

---
 .../backend/platforms/generic_cpu.py          | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 58b9c7946..b145b6f76 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -44,7 +44,7 @@ class GenericCpu(Platform):
 
     @property
     def required_headers(self) -> set[str]:
-        return {"<math.h>", "<limits.h>", "<float.h>"}
+        return {"<math.h>", "<limits>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -64,22 +64,7 @@ class GenericCpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
-            # get type prefix for macro
-            # TODO: there must be a better way...
-            tpe = ""
-            match dtype:
-                case PsIeeeFloatType():
-                    match dtype.width:
-                        case 32:
-                            tpe = "FLT"
-                        case 64:
-                            tpe = "DBL"
-                case _:
-                    raise MaterializationError(
-                        f"No implementation available for function {func} on data type {dtype}"
-                    )
-
-            return PsLiteralExpr(PsLiteral(f"{tpe}_{func.function_name}".upper(), dtype))
+            return PsLiteralExpr(PsLiteral(f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
-- 
GitLab


From a9da7d432d3bb5ed03da5f08f7b5dbd94c17ff7d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 29 Jan 2025 11:56:00 +0100
Subject: [PATCH 083/180] Fix lint [skip ci]

---
 src/pystencils/backend/kernelcreation/freeze.py | 1 -
 src/pystencils/codegen/parameters.py            | 2 +-
 src/pystencils/compound_op_mapping.py           | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 5bb7f8b08..c8d84c1b4 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -227,7 +227,6 @@ class FreezeExpressions:
         self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info)
         self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr)
 
-
         return PsAssignment(new_lhs, new_rhs)
 
     def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr:
diff --git a/src/pystencils/codegen/parameters.py b/src/pystencils/codegen/parameters.py
index e6a513cc7..d8411266e 100644
--- a/src/pystencils/codegen/parameters.py
+++ b/src/pystencils/codegen/parameters.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from warnings import warn
-from typing import Sequence, Iterable, Optional
+from typing import Sequence, Iterable
 
 from .properties import (
     PsSymbolProperty,
diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py
index eb10b3381..1eadfa6f0 100644
--- a/src/pystencils/compound_op_mapping.py
+++ b/src/pystencils/compound_op_mapping.py
@@ -1,4 +1,3 @@
-from enum import Enum
 from operator import truediv, mul, sub, add
 
 from .backend.ast.expressions import PsExpression, PsCall
-- 
GitLab


From 53807242b8f2eae36b75abb3d101d53b3948157a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 16:20:25 +0100
Subject: [PATCH 084/180] Remove orig_symbol from reduction info as it is not
 needed

---
 src/pystencils/backend/kernelcreation/context.py | 2 --
 src/pystencils/backend/kernelcreation/freeze.py  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 868a7852c..67d5b1c1d 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -52,8 +52,6 @@ class ReductionInfo:
 
     op: ReductionOp
     init_val: PsExpression
-
-    orig_symbol: PsSymbol
     ptr_symbol: PsSymbol
 
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index c8d84c1b4..cfa145e5a 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -220,7 +220,7 @@ class FreezeExpressions:
             case _:
                 raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
 
-        reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb, orig_lhs_symb_as_ptr)
+        reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb_as_ptr)
 
         # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info
         self._ctx.add_symbol(new_lhs_symb)
-- 
GitLab


From dd8f421d774f59d1dd36d4fdcb76a51375f4bda9 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 16:38:46 +0100
Subject: [PATCH 085/180] Introduce functions to be unfolded by platform into
 code blocks for reduction init and write-back

---
 src/pystencils/backend/functions.py           | 43 +++++++++++++++++++
 .../backend/kernelcreation/typification.py    |  4 +-
 .../backend/platforms/generic_cpu.py          | 41 ++++++++++++++++--
 src/pystencils/backend/platforms/platform.py  | 11 +++++
 .../transformations/select_functions.py       |  4 +-
 src/pystencils/codegen/driver.py              | 29 +++++++------
 tests/kernelcreation/test_reduction.py        |  4 +-
 7 files changed, 113 insertions(+), 23 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 18c2277cf..201321693 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -30,6 +30,7 @@ from typing import Any, Sequence, TYPE_CHECKING
 from abc import ABC
 from enum import Enum
 
+from ..sympyextensions import ReductionOp
 from ..types import PsType
 from .exceptions import PsInternalCompilerError
 
@@ -134,6 +135,48 @@ class PsMathFunction(PsFunction):
         return hash(self._func)
 
 
+class ReductionFunctions(Enum):
+    """Functions representing the individual steps of a reduction inside a kernel.
+
+    Members are ``(name, num_args)`` pairs; each platform has to unfold them into a concrete implementation.
+    """
+
+    InitLocalCopy = ("InitLocalCopy", 2)
+    WriteBackToPtr = ("WriteBackToPtr", 2)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
+
+
+class PsReductionFunction(PsFunction):
+    """Backend function for one reduction step; identified by both `func` and `op`."""
+
+    def __init__(self, func: ReductionFunctions, op: ReductionOp) -> None:
+        super().__init__(func.function_name, func.num_args)
+        self._func = func
+        self._op = op
+
+    @property
+    def func(self) -> ReductionFunctions:
+        return self._func
+
+    @property
+    def op(self) -> ReductionOp:
+        return self._op
+
+    def __str__(self) -> str:
+        return f"{self._func.function_name}"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, PsReductionFunction):
+            return False
+        return self._func == other._func and self._op == other._op
+
+    def __hash__(self) -> int:
+        return hash((self._func, self._op))
+
+
 class CFunction(PsFunction):
     """A concrete C function.
 
diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 62feca265..059817bfd 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -50,7 +50,7 @@ from ..ast.expressions import (
     PsNot,
 )
 from ..ast.vector import PsVecBroadcast, PsVecMemAcc
-from ..functions import PsMathFunction, CFunction
+from ..functions import PsMathFunction, CFunction, PsReductionFunction
 from ..ast.util import determine_memory_object
 from ..exceptions import TypificationError
 
@@ -590,7 +590,7 @@ class Typifier:
 
             case PsCall(function, args):
                 match function:
-                    case PsMathFunction():
+                    case PsMathFunction() | PsReductionFunction():
                         for arg in args:
                             self.visit_expr(arg, tc)
                         tc.infer_dtype(expr)
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index b145b6f76..33cb28711 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -1,11 +1,14 @@
 from abc import ABC, abstractmethod
 from typing import Sequence
 
-from pystencils.backend.ast.expressions import PsCall
+from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
 
-from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ..ast import PsAstNode
+from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, \
+    PsReductionFunction
 from ..literals import PsLiteral
-from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType
+from ...compound_op_mapping import compound_op_to_expr
+from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType
 
 from .platform import Platform
 from ..exceptions import MaterializationError
@@ -18,7 +21,7 @@ from ..kernelcreation.iteration_space import (
 )
 
 from ..constants import PsConstant
-from ..ast.structural import PsDeclaration, PsLoop, PsBlock
+from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsAssignment
 from ..ast.expressions import (
     PsSymbolExpr,
     PsExpression,
@@ -56,6 +59,36 @@ class GenericCpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
+    def unfold_function(self, call: PsCall) -> PsAstNode:
+        """Unfold a reduction function call into CPU code; raise `MaterializationError` if unsupported."""
+        assert isinstance(call.function, PsReductionFunction)
+
+        func = call.function.func
+        match func:
+            case ReductionFunctions.InitLocalCopy:
+                symbol_expr, init_val = call.args
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
+
+                return PsDeclaration(symbol_expr, init_val)
+            case ReductionFunctions.WriteBackToPtr:
+                ptr_expr, symbol_expr = call.args
+                op = call.function.op
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+
+                ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
+                # Combine on a clone so that target and source of the assignment do not share an AST node.
+                # TODO: can this be avoided somehow?
+                potential_call = compound_op_to_expr(op, ptr_access.clone(), symbol_expr)
+                if isinstance(potential_call, PsCall):
+                    potential_call.dtype = symbol_expr.dtype
+                    potential_call = self.select_function(potential_call)
+
+                return PsAssignment(ptr_access, potential_call)
+            case _:
+                raise MaterializationError(f"Cannot unfold reduction function {func} on this platform")
+
     def select_function(self, call: PsCall) -> PsExpression:
         assert isinstance(call.function, PsMathFunction)
 
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 2c7ee1c5f..732f37bbc 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
+from ..ast import PsAstNode
 from ..ast.structural import PsBlock
 from ..ast.expressions import PsCall, PsExpression
 
@@ -40,3 +41,13 @@ class Platform(ABC):
         If no viable implementation exists, raise a `MaterializationError`.
         """
         pass
+
+    @abstractmethod
+    def unfold_function(
+        self, call: PsCall
+    ) -> PsAstNode:
+        """Unfold the given function call into the AST node implementing it on this platform.
+
+        If no viable implementation exists, raise a `MaterializationError`.
+        """
+        pass
diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py
index e41c345ae..0045de87b 100644
--- a/src/pystencils/backend/transformations/select_functions.py
+++ b/src/pystencils/backend/transformations/select_functions.py
@@ -1,7 +1,7 @@
 from ..platforms import Platform
 from ..ast import PsAstNode
 from ..ast.expressions import PsCall
-from ..functions import PsMathFunction
+from ..functions import PsMathFunction, PsReductionFunction
 
 
 class SelectFunctions:
@@ -19,5 +19,7 @@ class SelectFunctions:
 
         if isinstance(node, PsCall) and isinstance(node.function, PsMathFunction):
             return self._platform.select_function(node)
+        elif isinstance(node, PsCall) and isinstance(node.function, PsReductionFunction):
+            return self._platform.unfold_function(node)
         else:
             return node
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index ba7df317a..9a80439e7 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -7,14 +7,14 @@ from .config import CreateKernelConfig, OpenMpConfig, VectorizationConfig, AUTO
 from .kernel import Kernel, GpuKernel, GpuThreadsRange
 from .properties import PsSymbolProperty, FieldShape, FieldStride, FieldBasePtr
 from .parameters import Parameter
-from ..compound_op_mapping import compound_op_to_expr
-from ..backend.ast.expressions import PsSymbolExpr, PsMemAcc, PsConstantExpr
+from ..backend.functions import PsReductionFunction, ReductionFunctions
+from ..backend.ast.expressions import PsSymbolExpr, PsCall
 
 from ..types import create_numeric_type, PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop, PsAssignment, PsDeclaration
+from ..backend.ast.structural import PsBlock, PsLoop
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -156,19 +156,20 @@ class DefaultKernelCreationDriver:
 
         #   Extensions for reductions
         for symbol, reduction_info in self._ctx.symbols_reduction_info.items():
-            # Init local reduction variable copy
-            kernel_ast.statements = [PsDeclaration(PsSymbolExpr(symbol),
-                                                   reduction_info.init_val)] + kernel_ast.statements
+            typify = Typifier(self._ctx)
+            symbol_expr = typify(PsSymbolExpr(symbol))
+            ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
+            init_val = typify(reduction_info.init_val)
 
-            # Write back result to reduction target variable
-            ptr_access = PsMemAcc(PsSymbolExpr(reduction_info.ptr_symbol),
-                                  PsConstantExpr(PsConstant(0)))
-            kernel_ast.statements += [PsAssignment(
-                ptr_access, compound_op_to_expr(reduction_info.op, ptr_access, PsSymbolExpr(symbol)))]
+            init_local_copy = PsCall(PsReductionFunction(ReductionFunctions.InitLocalCopy, reduction_info.op),
+                                     [symbol_expr, init_val])
+            write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op),
+                                    [ptr_symbol_expr, symbol_expr])
 
-            # TODO: only newly introduced nodes
-            typify = Typifier(self._ctx)
-            kernel_ast = typify(kernel_ast)
+            # Init local reduction variable copy
+            kernel_ast.statements = [init_local_copy] + kernel_ast.statements
+            # Write back result to reduction target variable
+            kernel_ast.statements += [write_back_ptr]
 
         #   Target-Specific optimizations
         if self._cfg.target.is_cpu():
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 69b75e711..b24058571 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -32,11 +32,11 @@ def test_reduction(dtype, op):
     config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True)
 
     ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype)
+    ps.show_code(ast_reduction)
+
     # code_reduction = ps.get_code_str(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
-    ps.show_code(ast_reduction)
-
     array = np.full((SIZE,), INIT_ARR, dtype=dtype)
     reduction_array = np.full((1,), INIT_W, dtype=dtype)
 
-- 
GitLab


From 6e08683b7e5fe681986a677a014cbcadafa017f2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 18:04:02 +0100
Subject: [PATCH 086/180] Add dummy implementations for unfold_function in
 cuda/sycl platforms

---
 src/pystencils/backend/platforms/cuda.py | 6 ++++++
 src/pystencils/backend/platforms/sycl.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index f146cfbfd..bb42e1f9b 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 from warnings import warn
 from typing import TYPE_CHECKING
 
+from ..ast import PsAstNode
 from ...types import constify
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -134,6 +135,11 @@ class CudaPlatform(GenericGpu):
             f"No implementation available for function {func} on data type {dtype}"
         )
 
+    def unfold_function(
+        self, call: PsCall
+    ) -> PsAstNode:
+        pass
+
     #   Internals
 
     def _prepend_dense_translation(
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 9c04d6074..dd38aeb48 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
 
+from ..ast import PsAstNode
 from ..functions import CFunction, PsMathFunction, MathFunctions
 from ..kernelcreation.iteration_space import (
     IterationSpace,
@@ -108,6 +109,11 @@ class SyclPlatform(GenericGpu):
             f"No implementation available for function {func} on data type {dtype}"
         )
 
+    def unfold_function(
+        self, call: PsCall
+    ) -> PsAstNode:
+        pass
+
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> tuple[PsBlock, GpuThreadsRange]:
-- 
GitLab


From e15d3cf7a043fc45b78747d85e035dcd314bcd42 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 19:15:13 +0100
Subject: [PATCH 087/180] Add first CUDA reduction impl using atomic operations

---
 src/pystencils/backend/platforms/cuda.py | 31 +++++++++++++++++++++---
 src/pystencils/include/gpu_defines.h     | 28 +++++++++++++++++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index bb42e1f9b..95480de93 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,7 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
-from ...types import constify
+from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
@@ -23,12 +23,12 @@ from ..ast.expressions import (
     PsCast,
     PsCall,
     PsLookup,
-    PsBufferAcc,
+    PsBufferAcc, PsSymbolExpr
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
+from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions
 
 if TYPE_CHECKING:
     from ...codegen import GpuIndexingConfig, GpuThreadsRange
@@ -138,7 +138,30 @@ class CudaPlatform(GenericGpu):
     def unfold_function(
         self, call: PsCall
     ) -> PsAstNode:
-        pass
+        assert isinstance(call.function, PsReductionFunction)
+
+        func = call.function.func
+
+        match func:
+            case ReductionFunctions.InitLocalCopy:
+                symbol_expr, init_val = call.args
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
+
+                return PsDeclaration(symbol_expr, init_val)
+            case ReductionFunctions.WriteBackToPtr:
+                ptr_expr, symbol_expr = call.args
+                op = call.function.op
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+
+                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void"))
+                call.args = [ptr_expr, symbol_expr]
+
+                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
+                    NotImplementedError("atomicMul is only available for float32/64 datatypes")
+
+                return call
 
     #   Internals
 
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 67e7722e9..04eeace47 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -10,3 +10,31 @@ typedef __hip_int8_t int8_t;
 typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
+
+#ifdef __CUDA_ARCH__
+// Implementation of atomic multiplication
+// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+__device__ double atomicMul(double* address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int oldValue = *address_as_ull, assumed;
+    do {
+      assumed = oldValue;
+      oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val *
+                           __longlong_as_double(assumed)));
+    } while (assumed != oldValue);
+
+    return __longlong_as_double(oldValue);
+}
+
+__device__ float atomicMul(float* address, float val) {
+    int* address_as_int = (int*)address;
+    int old = *address_as_int;
+    int assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
+    } while (assumed != old);
+
+    return __int_as_float(old);
+}
+#endif
-- 
GitLab


From 10def05e256d2c316f5063e48bbbfebe2c84ea85 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 19:59:46 +0100
Subject: [PATCH 088/180] Fix typecheck

---
 src/pystencils/backend/platforms/cuda.py | 2 +-
 src/pystencils/backend/platforms/sycl.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 95480de93..bf5b91b82 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -156,7 +156,7 @@ class CudaPlatform(GenericGpu):
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
                 call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void"))
-                call.args = [ptr_expr, symbol_expr]
+                call.args = (ptr_expr, symbol_expr)
 
                 if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
                     NotImplementedError("atomicMul is only available for float32/64 datatypes")
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index dd38aeb48..2ea2934f3 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -112,7 +112,9 @@ class SyclPlatform(GenericGpu):
     def unfold_function(
         self, call: PsCall
     ) -> PsAstNode:
-        pass
+        raise MaterializationError(
+            f"No implementation available for function {call.function.name}"
+        )
 
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
-- 
GitLab


From 0fb11858f2c65fc46c8dca469c75c28bf283dfdb Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 30 Jan 2025 20:06:00 +0100
Subject: [PATCH 089/180] Add CUDA backend for numeric limits

---
 src/pystencils/backend/platforms/cuda.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index bf5b91b82..ef3c11598 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -28,7 +28,8 @@ from ..ast.expressions import (
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions
+from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions, \
+    NumericLimitsFunctions
 
 if TYPE_CHECKING:
     from ...codegen import GpuIndexingConfig, GpuThreadsRange
@@ -64,7 +65,7 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"gpu_defines.h"'}
+        return {'"gpu_defines.h"', "<cuda/std/limits>"}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -83,6 +84,9 @@ class CudaPlatform(GenericGpu):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
+        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            return PsLiteralExpr(PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
+
         if isinstance(dtype, PsIeeeFloatType):
             match func:
                 case (
-- 
GitLab


From 616f609f24551439403c13b0d282fab147099f9f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 31 Jan 2025 14:51:45 +0100
Subject: [PATCH 090/180] Fix lint [skip ci]

---
 src/pystencils/backend/platforms/cuda.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index ef3c11598..3fe5be229 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -85,7 +85,8 @@ class CudaPlatform(GenericGpu):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
-            return PsLiteralExpr(PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
+            return PsLiteralExpr(
+                PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
 
         if isinstance(dtype, PsIeeeFloatType):
             match func:
-- 
GitLab


From 4c7fd40921a2486f8164efc84e455b59138ae6d1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 31 Jan 2025 14:56:08 +0100
Subject: [PATCH 091/180] Fix lint [skip ci]

---
 src/pystencils/backend/platforms/cuda.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 3fe5be229..73c4b3b47 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -160,7 +160,8 @@ class CudaPlatform(GenericGpu):
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype], PsCustomType("void"))
+                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                                          PsCustomType("void"))
                 call.args = (ptr_expr, symbol_expr)
 
                 if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
-- 
GitLab


From 826ee8e26f2bb09b95bb473b347d06cd6a36207c Mon Sep 17 00:00:00 2001
From: Richard Angersbach <iwia025h@csnhr.nhr.fau.de>
Date: Tue, 4 Feb 2025 16:30:13 +0100
Subject: [PATCH 092/180] Try supporting pointer dtypes for reductions in cupy
 gpu jit

---
 src/pystencils/jit/gpu_cupy.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index c208ac219..467e86be7 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -11,7 +11,7 @@ except ImportError:
 from ..codegen import Target
 from ..field import FieldType
 
-from ..types import PsType
+from ..types import PsType, PsPointerType
 from .jit import JitBase, JitError, KernelWrapper
 from ..codegen import (
     Kernel,
@@ -183,6 +183,9 @@ class CupyKernelWrapper(KernelWrapper):
                                 kparam.dtype,
                             )
                             break
+            elif isinstance(kparam.dtype, PsPointerType):
+                val = kwargs[kparam.name]
+                args.append(val)
             else:
                 #   scalar parameter
                 val: Any = kwargs[kparam.name]
-- 
GitLab


From f60d9d5df3c87c54d587ad7496025e70ee2388f0 Mon Sep 17 00:00:00 2001
From: Richard Angersbach <iwia025h@csnhr.nhr.fau.de>
Date: Tue, 4 Feb 2025 16:31:16 +0100
Subject: [PATCH 093/180] Minor adaptations for reduction test

---
 tests/kernelcreation/test_reduction.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index b24058571..be2589912 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,9 +18,9 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
-def test_reduction(dtype, op):
-    gpu_avail = False
+@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
+def test_reduction(target, dtype, op):
+    gpu_avail = target is ps.Target.GPU
 
     x = ps.fields(f'x: {dtype}[1d]')
     w = ps.TypedSymbol("w", dtype)
-- 
GitLab


From d8ae900242264392479f4405678d9a1f1b177890 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 4 Feb 2025 17:49:37 +0100
Subject: [PATCH 094/180] Use predefined macro values for numeric limits in
 cuda backend

---
 src/pystencils/backend/platforms/cuda.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 73c4b3b47..fa246c128 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -65,7 +65,7 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"gpu_defines.h"', "<cuda/std/limits>"}
+        return {'"gpu_defines.h"'}
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
@@ -85,8 +85,11 @@ class CudaPlatform(GenericGpu):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+            assert isinstance(dtype, PsIeeeFloatType)
+            defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" }
+
             return PsLiteralExpr(
-                PsLiteral(f"::cuda::std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
+                PsLiteral(defines[func.function_name], dtype))
 
         if isinstance(dtype, PsIeeeFloatType):
             match func:
-- 
GitLab


From a2a59d40b66390cebe849870d1b9cf058da82850 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 4 Feb 2025 18:04:15 +0100
Subject: [PATCH 095/180] Wrap statement around generated atomic call [skip ci]

---
 src/pystencils/backend/platforms/cuda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index fa246c128..a89225a08 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -16,7 +16,7 @@ from ..kernelcreation import (
 )
 
 from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement
 from ..ast.expressions import (
     PsExpression,
     PsLiteralExpr,
@@ -170,7 +170,7 @@ class CudaPlatform(GenericGpu):
                 if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
                     NotImplementedError("atomicMul is only available for float32/64 datatypes")
 
-                return call
+                return PsStatement(call)
 
     #   Internals
 
-- 
GitLab


From 5caafdd05712621bc990ce7ef13c08f232f260af Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Feb 2025 15:13:51 +0100
Subject: [PATCH 096/180] Add guard for INFINITY numeric limit macro used by
 cuda backend

---
 src/pystencils/backend/platforms/cuda.py | 3 +--
 src/pystencils/include/gpu_defines.h     | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1a8fdc482..1af8917cc 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -90,8 +90,7 @@ class CudaPlatform(GenericGpu):
             assert isinstance(dtype, PsIeeeFloatType)
             defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" }
 
-            return PsLiteralExpr(
-                PsLiteral(defines[func.function_name], dtype))
+            return PsLiteralExpr(PsLiteral(defines[func], dtype))
 
         if isinstance(dtype, PsIeeeFloatType):
             match func:
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 04eeace47..8f961e25b 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -1,8 +1,10 @@
 #pragma once
 
 #define POS_INFINITY __int_as_float(0x7f800000)
-#define INFINITY POS_INFINITY
 #define NEG_INFINITY __int_as_float(0xff800000)
+#ifndef INFINITY
+#define INFINITY POS_INFINITY
+#endif
 
 #ifdef __HIPCC_RTC__
 typedef __hip_uint8_t uint8_t;
-- 
GitLab


From a71e0d318a6f96bef38ce4b7b260d8f3fd73d91d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Feb 2025 18:03:52 +0100
Subject: [PATCH 097/180] Temporarily change default CUDA block size for CUDA
 jit

---
 src/pystencils/jit/gpu_cupy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 54fb41173..331b58ce5 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -242,7 +242,7 @@ class CupyKernelWrapper(KernelWrapper):
 
 class CupyJit(JitBase):
 
-    def __init__(self, default_block_size: Sequence[int] = (128, 2, 1)):
+    def __init__(self, default_block_size: Sequence[int] = (128, 1, 1)):
         self._runtime_headers = {"<cstdint>"}
 
         if len(default_block_size) > 3:
-- 
GitLab


From 60a348f1f3f39d184da872007756c3aeed13ccee Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 10 Feb 2025 18:30:29 +0100
Subject: [PATCH 098/180] Support atomic sub, min, max for fp reductions using
 custom implementations with CAS mechanism

---
 src/pystencils/backend/platforms/cuda.py | 12 ++++-
 src/pystencils/include/gpu_defines.h     | 59 +++++++++++++++++++++++-
 tests/kernelcreation/test_reduction.py   |  2 +-
 3 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1af8917cc..f9fbdfa56 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -3,6 +3,7 @@ from warnings import warn
 from typing import TYPE_CHECKING
 
 from ..ast import PsAstNode
+from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, PsPointerType, PsScalarType, PsCustomType
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
@@ -165,9 +166,16 @@ class CudaPlatform(GenericGpu):
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                match op:
+                    case ReductionOp.Sub:
+                        # workaround for unsupported atomicSub: use atomic add and invert sign
+                        call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
                                           PsCustomType("void"))
-                call.args = (ptr_expr, symbol_expr)
+                        call.args = (ptr_expr, -symbol_expr)
+                    case _:
+                        call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
+                                                  PsCustomType("void"))
+                        call.args = (ptr_expr, symbol_expr)
 
                 if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
                     NotImplementedError("atomicMul is only available for float32/64 datatypes")
diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 8f961e25b..5525bbc69 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -14,8 +14,11 @@ typedef __hip_int16_t int16_t;
 #endif
 
 #ifdef __CUDA_ARCH__
-// Implementation of atomic multiplication
-// See https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+// No direct implementation of atomic multiplication, minimum and maximum available
+// -> add support by custom implementations using a CAS mechanism
+
+// - atomicMul (double/float)
+//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
 __device__ double atomicMul(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int oldValue = *address_as_ull, assumed;
@@ -39,4 +42,56 @@ __device__ float atomicMul(float* address, float val) {
 
     return __int_as_float(old);
 }
+
+// - atomicMin (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMin(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val < __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMin(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val < __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+// - atomicMax (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMax(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val > __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMax(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val > __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
 #endif
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index be2589912..07fb94a7e 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -18,7 +18,7 @@ SOLUTION = {
 
 
 @pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+"]) #, "-", "*", "min", "max"
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction(target, dtype, op):
     gpu_avail = target is ps.Target.GPU
 
-- 
GitLab


From d4b7e78fca17ca130c55959e330a161d07ebba80 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 11 Feb 2025 17:16:23 +0100
Subject: [PATCH 099/180] Add initial implementation for horizontal reductions
 for vectorization

---
 src/pystencils/backend/ast/vector.py          | 40 +++++++++++++++++++
 src/pystencils/backend/emission/ir_printer.py | 11 ++++-
 .../backend/kernelcreation/typification.py    | 18 ++++++++-
 src/pystencils/backend/platforms/x86.py       | 16 ++++++--
 .../transformations/loop_vectorizer.py        | 31 +++++++++++---
 .../transformations/select_intrinsics.py      | 14 ++++++-
 tests/kernelcreation/test_reduction.py        |  5 ++-
 7 files changed, 122 insertions(+), 13 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 705d25094..8ff1ff8a0 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -5,6 +5,7 @@ from typing import cast
 from .astnode import PsAstNode
 from .expressions import PsExpression, PsLvalue, PsUnOp
 from .util import failing_cast
+from ...sympyextensions import ReductionOp
 
 from ...types import PsVectorType
 
@@ -42,6 +43,45 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
         )
 
 
+class PsVecHorizontal(PsUnOp, PsVectorOp):
+    """Extracts scalar value from N vector lanes."""
+
+    __match_args__ = ("lanes", "operand", "operation")
+
+    def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp):
+        super().__init__(operand)
+        self._lanes = lanes
+        self._reduction_operation = reduction_op
+
+    @property
+    def lanes(self) -> int:
+        return self._lanes
+
+    @lanes.setter
+    def lanes(self, n: int):
+        self._lanes = n
+
+    @property
+    def reduction_operation(self) -> ReductionOp:
+        return self._reduction_operation
+
+    @reduction_operation.setter
+    def reduction_operation(self, op: ReductionOp):
+        self._reduction_operation = op
+
+    def _clone_expr(self) -> PsVecHorizontal:
+        return PsVecHorizontal(self._lanes, self._operand.clone(), self._operation.clone())
+
+    def structurally_equal(self, other: PsAstNode) -> bool:
+        if not isinstance(other, PsVecHorizontal):
+            return False
+        return (
+                super().structurally_equal(other)
+                and self._lanes == other._lanes
+                and self._operation == other._operation
+        )
+
+
 class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     """Pointer-based vectorized memory access.
     
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index ffb65181c..04084dd3b 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR
 
 from ..ast import PsAstNode
 from ..ast.expressions import PsBufferAcc
-from ..ast.vector import PsVecMemAcc, PsVecBroadcast
+from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
 
 if TYPE_CHECKING:
     from ...codegen import Kernel
@@ -77,6 +77,15 @@ class IRAstPrinter(BasePrinter):
                     f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest
                 )
 
+            case PsVecHorizontal(lanes, operand, reduction_op):
+                pc.push_op(Ops.Weakest, LR.Middle)
+                operand_code = self.visit(operand, pc)
+                pc.pop_op()
+
+                return pc.parenthesize(
+                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({operand_code})", Ops.Weakest
+                )
+
             case _:
                 return super().visit(node, pc)
 
diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 059817bfd..25fb55a0b 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -49,7 +49,7 @@ from ..ast.expressions import (
     PsNeg,
     PsNot,
 )
-from ..ast.vector import PsVecBroadcast, PsVecMemAcc
+from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal
 from ..functions import PsMathFunction, CFunction, PsReductionFunction
 from ..ast.util import determine_memory_object
 from ..exceptions import TypificationError
@@ -640,6 +640,22 @@ class Typifier:
 
                 tc.apply_dtype(PsVectorType(op_tc.target_type, lanes), expr)
 
+            case PsVecHorizontal():
+                op_tc = TypeContext()
+                self.visit_expr(expr.operand, op_tc)
+
+                if op_tc.target_type is None:
+                    raise TypificationError(
+                        f"Unable to determine type of argument to vector horizontal: {expr.operand}"
+                    )
+
+                if not isinstance(op_tc.target_type, PsVectorType):
+                    raise TypificationError(
+                        f"Illegal type in argument to vector horizontal: {op_tc.target_type}"
+                    )
+
+                tc.apply_dtype(op_tc.target_type.scalar_type, expr)
+
             case _:
                 raise NotImplementedError(f"Can't typify {expr}")
 
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index 7d2fe650f..acd397155 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -17,8 +17,8 @@ from ..ast.expressions import (
     PsCast,
     PsCall,
 )
-from ..ast.vector import PsVecMemAcc, PsVecBroadcast
-from ...types import PsCustomType, PsVectorType, PsPointerType
+from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
+from ...types import PsCustomType, PsVectorType, PsPointerType, PsType
 from ..constants import PsConstant
 
 from ..exceptions import MaterializationError
@@ -160,7 +160,14 @@ class X86VectorCpu(GenericVectorCpu):
     ) -> PsExpression:
         match expr:
             case PsUnOp() | PsBinOp():
-                func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype())
+                vtype: PsType
+                if isinstance(expr, PsVecHorizontal):
+                    # expression itself is scalar, but argument is a vector
+                    vtype = expr.operand.get_dtype()
+                else:
+                    vtype = expr.get_dtype()
+
+                func = _x86_op_intrin(self._vector_arch, expr, vtype)
                 intrinsic = func(*operands)
                 intrinsic.dtype = func.return_type
                 return intrinsic
@@ -343,6 +350,9 @@ def _x86_op_intrin(
             if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
                 suffix += "x"
             atype = vtype.scalar_type
+        case PsVecHorizontal():
+            opstr = f"horizontal_{op.reduction_operation.name.lower()}"
+            rtype = vtype.scalar_type
         case PsAdd():
             opstr = "add"
         case PsSub():
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index e1e4fea50..39d72adb4 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -7,9 +7,9 @@ from ...types import PsVectorType, PsScalarType
 from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
-from ..ast.structural import PsLoop, PsBlock, PsDeclaration
-from ..ast.expressions import PsExpression, PsTernary, PsGt
-from ..ast.vector import PsVecBroadcast
+from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment
+from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr
+from ..ast.vector import PsVecBroadcast, PsVecHorizontal
 from ..ast.analysis import collect_undefined_symbols
 
 from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer
@@ -134,6 +134,21 @@ class LoopVectorizer:
         #   Prepare vectorization context
         vc = VectorizationContext(self._ctx, self._lanes, axis)
 
+        #   Prepare reductions
+        simd_init_local_reduction_vars = []
+        simd_writeback_local_reduction_vars = []
+        for symb, reduction_info in self._ctx.symbols_reduction_info.items():
+            # Vectorize symbol for local copy
+            vector_symb = vc.vectorize_symbol(symb)
+
+            # Declare and init vector
+            simd_init_local_reduction_vars += [self._type_fold(PsDeclaration(
+                PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb))))]
+
+            # Write back vectorization result
+            simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment(
+                PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(vector_symb), reduction_info.op)))]
+
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
 
@@ -224,10 +239,14 @@ class LoopVectorizer:
                 )
 
                 return PsBlock(
+                    simd_init_local_reduction_vars +
                     [
                         simd_stop_decl,
                         simd_step_decl,
-                        simd_loop,
+                        simd_loop
+                    ] +
+                    simd_writeback_local_reduction_vars +
+                    [
                         trailing_start_decl,
                         trailing_loop,
                     ]
@@ -238,11 +257,13 @@ class LoopVectorizer:
 
             case LoopVectorizer.TrailingItersTreatment.NONE:
                 return PsBlock(
+                    simd_init_local_reduction_vars +
                     [
                         simd_stop_decl,
                         simd_step_decl,
                         simd_loop,
-                    ]
+                    ] +
+                    simd_writeback_local_reduction_vars
                 )
 
     @overload
diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py
index 060192810..7a03e293a 100644
--- a/src/pystencils/backend/transformations/select_intrinsics.py
+++ b/src/pystencils/backend/transformations/select_intrinsics.py
@@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement
 from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral
 from ...types import PsCustomType, PsVectorType, constify, deconstify
 from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp
-from ..ast.vector import PsVecMemAcc
+from ..ast.vector import PsVecMemAcc, PsVecHorizontal
 from ..exceptions import MaterializationError
 from ..functions import CFunction, PsMathFunction
 
@@ -86,6 +86,10 @@ class SelectIntrinsics:
                 new_rhs = self.visit_expr(rhs, sc)
                 return PsStatement(self._platform.vector_store(lhs, new_rhs))
 
+            case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal):
+                new_rhs = self.visit_expr(rhs, sc)
+                return PsAssignment(lhs, new_rhs)
+
             case _:
                 node.children = [self.visit(c, sc) for c in node.children]
 
@@ -93,7 +97,13 @@ class SelectIntrinsics:
 
     def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression:
         if not isinstance(expr.dtype, PsVectorType):
-            return expr
+            # special case: result type of horizontal reduction is scalar
+            if isinstance(expr, PsVecHorizontal):
+                op = self.visit_expr(expr.operand, sc)
+                print(op)
+                return self._platform.op_intrinsic(expr, [op])
+            else:
+                return expr
 
         match expr:
             case PsSymbolExpr(symb):
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index be2589912..f64ba154a 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -29,7 +29,10 @@ def test_reduction(target, dtype, op):
 
     red_assign = reduction_assignment_from_str(w, op, x.center())
 
-    config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail else ps.CreateKernelConfig(cpu_openmp=True)
+    vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True}
+
+    config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail \
+        else ps.CreateKernelConfig(cpu_openmp=True, cpu_vectorize_info=vectorize_info)
 
     ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype)
     ps.show_code(ast_reduction)
-- 
GitLab


From b4b105be0ff674bd1cacaea75e5f92bf8a7fda3c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 11 Feb 2025 17:21:54 +0100
Subject: [PATCH 100/180] Use parametrized target in reduction test config

---
 tests/kernelcreation/test_reduction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 67a844821..12dc4ba1c 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -31,8 +31,8 @@ def test_reduction(target, dtype, op):
 
     vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True}
 
-    config = ps.CreateKernelConfig(target=ps.Target.GPU) if gpu_avail \
-        else ps.CreateKernelConfig(cpu_openmp=True, cpu_vectorize_info=vectorize_info)
+    config = ps.CreateKernelConfig(target=target) if gpu_avail \
+        else ps.CreateKernelConfig(target=target, cpu_openmp=True, cpu_vectorize_info=vectorize_info)
 
     ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype)
     ps.show_code(ast_reduction)
-- 
GitLab


From eb7823a5b28589a45f968ac6415ee2a02543a3ab Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Feb 2025 14:13:05 +0100
Subject: [PATCH 101/180] Minor refactor of reduction ops

---
 src/pystencils/backend/ast/vector.py           | 16 ++++++++--------
 src/pystencils/backend/functions.py            | 10 +++++-----
 .../backend/kernelcreation/freeze.py           |  9 +++++----
 .../backend/platforms/generic_cpu.py           |  2 +-
 src/pystencils/backend/platforms/x86.py        |  2 +-
 src/pystencils/compound_op_mapping.py          | 10 +++++-----
 src/pystencils/sympyextensions/reduction.py    | 18 +++++++++---------
 7 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 8ff1ff8a0..4e6b2ff00 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -51,7 +51,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp):
     def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp):
         super().__init__(operand)
         self._lanes = lanes
-        self._reduction_operation = reduction_op
+        self._reduction_op = reduction_op
 
     @property
     def lanes(self) -> int:
@@ -62,15 +62,15 @@ class PsVecHorizontal(PsUnOp, PsVectorOp):
         self._lanes = n
 
     @property
-    def reduction_operation(self) -> ReductionOp:
-        return self._reduction_operation
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
 
-    @reduction_operation.setter
-    def reduction_operation(self, op: ReductionOp):
-        self._reduction_operation = op
+    @reduction_op.setter
+    def reduction_op(self, op: ReductionOp):
+        self._reduction_op = op
 
     def _clone_expr(self) -> PsVecHorizontal:
-        return PsVecHorizontal(self._lanes, self._operand.clone(), self._operation.clone())
+        return PsVecHorizontal(self._lanes, self._operand.clone(), self._reduction_op)
 
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecHorizontal):
@@ -78,7 +78,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp):
         return (
                 super().structurally_equal(other)
                 and self._lanes == other._lanes
-                and self._operation == other._operation
+                and self._reduction_op == other._reduction_op
         )
 
 
diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index e1f742386..d28ef5f44 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -152,18 +152,18 @@ class ReductionFunctions(Enum):
 
 class PsReductionFunction(PsFunction):
 
-    def __init__(self, func: ReductionFunctions, op: ReductionOp) -> None:
+    def __init__(self, func: ReductionFunctions, reduction_op: ReductionOp) -> None:
         super().__init__(func.function_name, func.num_args)
         self._func = func
-        self._op = op
+        self._reduction_op = reduction_op
 
     @property
     def func(self) -> ReductionFunctions:
         return self._func
 
     @property
-    def op(self) -> ReductionOp:
-        return self._op
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
 
     def __str__(self) -> str:
         return f"{self._func.function_name}"
@@ -172,7 +172,7 @@ class PsReductionFunction(PsFunction):
         if not isinstance(other, PsReductionFunction):
             return False
 
-        return self._func == other._func
+        return self._func == other._func and self._reduction_op == other._reduction_op
 
     def __hash__(self) -> int:
         return hash(self._func)
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 675655802..ce65cd85d 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -189,6 +189,7 @@ class FreezeExpressions:
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
+        op = expr.reduction_op
         orig_lhs_symb = lhs.symbol
         dtype = lhs.dtype
 
@@ -202,11 +203,11 @@ class FreezeExpressions:
         new_lhs = PsSymbolExpr(new_lhs_symb)
 
         # get new rhs from augmented assignment
-        new_rhs: PsExpression = compound_op_to_expr(expr.op, new_lhs.clone(), rhs)
+        new_rhs: PsExpression = compound_op_to_expr(op, new_lhs.clone(), rhs)
 
         # match for reduction operation and set neutral init_val
         init_val: PsExpression
-        match expr.op:
+        match op:
             case ReductionOp.Add:
                 init_val = PsConstantExpr(PsConstant(0))
             case ReductionOp.Sub:
@@ -218,9 +219,9 @@ class FreezeExpressions:
             case ReductionOp.Max:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
             case _:
-                raise FreezeError(f"Unsupported reduced assignment: {expr.op}.")
+                raise FreezeError(f"Unsupported reduced assignment: {op}.")
 
-        reduction_info = ReductionInfo(expr.op, init_val, orig_lhs_symb_as_ptr)
+        reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr)
 
         # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info
         self._ctx.add_symbol(new_lhs_symb)
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index aa6e22b85..7655572de 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -74,7 +74,7 @@ class GenericCpu(Platform):
                 return PsDeclaration(symbol_expr, init_val)
             case ReductionFunctions.WriteBackToPtr:
                 ptr_expr, symbol_expr = call.args
-                op = call.function.op
+                op = call.function.reduction_op
 
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index 0727b65b9..59c3a178f 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -354,7 +354,7 @@ def _x86_op_intrin(
                 suffix += "x"
             atype = vtype.scalar_type
         case PsVecHorizontal():
-            opstr = f"horizontal_{op.reduction_operation.name.lower()}"
+            opstr = f"horizontal_{op.reduction_op.name.lower()}"
             rtype = vtype.scalar_type
         case PsAdd():
             opstr = "add"
diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py
index 1eadfa6f0..2dd88fc94 100644
--- a/src/pystencils/compound_op_mapping.py
+++ b/src/pystencils/compound_op_mapping.py
@@ -1,6 +1,6 @@
 from operator import truediv, mul, sub, add
 
-from .backend.ast.expressions import PsExpression, PsCall
+from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv
 from .backend.exceptions import FreezeError
 from .backend.functions import PsMathFunction, MathFunctions
 from .sympyextensions.reduction import ReductionOp
@@ -12,13 +12,13 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     if op in _available_operator_interface:
         match op:
             case ReductionOp.Add:
-                operator = add
+                operator = PsAdd
             case ReductionOp.Sub:
-                operator = sub
+                operator = PsSub
             case ReductionOp.Mul:
-                operator = mul
+                operator = PsMul
             case ReductionOp.Div:
-                operator = truediv
+                operator = PsDiv
             case _:
                 raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
         return operator(op1, op2)
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 9d8aecb5b..25ae5c0ac 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -22,36 +22,36 @@ class ReductionAssignment(AssignmentBase):
     binop : CompoundOp
        Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
     """
-    binop = None  # type: ReductionOp
+    reduction_op = None  # type: ReductionOp
 
     @property
-    def op(self):
-        return self.binop
+    def reduction_op(self):
+        return self.reduction_op
 
 
 class AddReductionAssignment(ReductionAssignment):
-    binop = ReductionOp.Add
+    reduction_op = ReductionOp.Add
 
 
 class SubReductionAssignment(ReductionAssignment):
-    binop = ReductionOp.Sub
+    reduction_op = ReductionOp.Sub
 
 
 class MulReductionAssignment(ReductionAssignment):
-    binop = ReductionOp.Mul
+    reduction_op = ReductionOp.Mul
 
 
 class MinReductionAssignment(ReductionAssignment):
-    binop = ReductionOp.Min
+    reduction_op = ReductionOp.Min
 
 
 class MaxReductionAssignment(ReductionAssignment):
-    binop = ReductionOp.Max
+    reduction_op = ReductionOp.Max
 
 
 # Mapping from ReductionOp enum to ReductionAssigment classes
 _reduction_assignment_classes = {
-    cls.binop: cls for cls in [
+    cls.reduction_op: cls for cls in [
         AddReductionAssignment, SubReductionAssignment, MulReductionAssignment,
         MinReductionAssignment, MaxReductionAssignment
     ]
-- 
GitLab


From 8e0a74784ccb0d22c89364871ee630ba67239b2e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Feb 2025 15:22:45 +0100
Subject: [PATCH 102/180] Refactor PsVecHorizontal as PsBinOp

---
 src/pystencils/backend/ast/vector.py          | 29 ++++++++++---
 src/pystencils/backend/emission/ir_printer.py |  8 ++--
 .../backend/kernelcreation/typification.py    | 43 ++++++++++++-------
 src/pystencils/backend/platforms/x86.py       | 15 +++++--
 .../transformations/loop_vectorizer.py        |  3 +-
 .../transformations/select_intrinsics.py      |  6 +--
 .../include/simd_horizontal_helpers.h         | 11 +++++
 7 files changed, 82 insertions(+), 33 deletions(-)
 create mode 100644 src/pystencils/include/simd_horizontal_helpers.h

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 4e6b2ff00..14249e1e8 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 from typing import cast
 
 from .astnode import PsAstNode
-from .expressions import PsExpression, PsLvalue, PsUnOp
+from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp
 from .util import failing_cast
 from ...sympyextensions import ReductionOp
 
@@ -43,13 +43,14 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
         )
 
 
-class PsVecHorizontal(PsUnOp, PsVectorOp):
+class PsVecHorizontal(PsBinOp, PsVectorOp):
     """Extracts scalar value from N vector lanes."""
 
-    __match_args__ = ("lanes", "operand", "operation")
+    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "operation")
 
-    def __init__(self, lanes: int, operand: PsExpression, reduction_op: ReductionOp):
-        super().__init__(operand)
+    def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression,
+                 reduction_op: ReductionOp):
+        super().__init__(scalar_operand, vector_operand)
         self._lanes = lanes
         self._reduction_op = reduction_op
 
@@ -61,6 +62,22 @@ class PsVecHorizontal(PsUnOp, PsVectorOp):
     def lanes(self, n: int):
         self._lanes = n
 
+    @property
+    def scalar_operand(self) -> PsExpression:
+        return self._op1
+
+    @scalar_operand.setter
+    def scalar_operand(self, op: PsExpression):
+        self._op1 = op
+
+    @property
+    def vector_operand(self) -> PsExpression:
+        return self._op2
+
+    @vector_operand.setter
+    def vector_operand(self, op: PsExpression):
+        self._op2 = op
+
     @property
     def reduction_op(self) -> ReductionOp:
         return self._reduction_op
@@ -70,7 +87,7 @@ class PsVecHorizontal(PsUnOp, PsVectorOp):
         self._reduction_op = op
 
     def _clone_expr(self) -> PsVecHorizontal:
-        return PsVecHorizontal(self._lanes, self._operand.clone(), self._reduction_op)
+        return PsVecHorizontal(self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op)
 
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecHorizontal):
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index 04084dd3b..1508e6d94 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -77,13 +77,15 @@ class IRAstPrinter(BasePrinter):
                     f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest
                 )
 
-            case PsVecHorizontal(lanes, operand, reduction_op):
+            case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op):
                 pc.push_op(Ops.Weakest, LR.Middle)
-                operand_code = self.visit(operand, pc)
+                scalar_operand_code = self.visit(scalar_operand, pc)
+                vector_operand_code = self.visit(vector_operand, pc)
                 pc.pop_op()
 
                 return pc.parenthesize(
-                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({operand_code})", Ops.Weakest
+                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>"
+                    f"({scalar_operand_code}, {vector_operand_code})", Ops.Weakest  # two args, not a tuple repr
                 )
 
             case _:
diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 25fb55a0b..544746ef6 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -579,6 +579,33 @@ class Typifier:
                 else:
                     tc.apply_dtype(PsBoolType(), expr)
 
+            case PsVecHorizontal():
+                # bin op consisting of a scalar and a vector that is converted to a scalar
+                # -> whole expression should be treated as scalar
+
+                scalar_op_tc = TypeContext()
+                self.visit_expr(expr.scalar_operand, scalar_op_tc)
+
+                vector_op_tc = TypeContext()
+                self.visit_expr(expr.vector_operand, vector_op_tc)
+
+                if scalar_op_tc.target_type is None or vector_op_tc.target_type is None:
+                    raise TypificationError(
+                        f"Unable to determine type of argument to vector horizontal: {expr}"
+                    )
+
+                if not isinstance(scalar_op_tc.target_type, PsScalarType):
+                    raise TypificationError(
+                        f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}"
+                    )
+
+                if not isinstance(vector_op_tc.target_type, PsVectorType):
+                    raise TypificationError(
+                        f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}"
+                    )
+
+                tc.apply_dtype(scalar_op_tc.target_type, expr)
+
             case PsBinOp(op1, op2):
                 self.visit_expr(op1, tc)
                 self.visit_expr(op2, tc)
@@ -640,22 +667,6 @@ class Typifier:
 
                 tc.apply_dtype(PsVectorType(op_tc.target_type, lanes), expr)
 
-            case PsVecHorizontal():
-                op_tc = TypeContext()
-                self.visit_expr(expr.operand, op_tc)
-
-                if op_tc.target_type is None:
-                    raise TypificationError(
-                        f"Unable to determine type of argument to vector horizontal: {expr.operand}"
-                    )
-
-                if not isinstance(op_tc.target_type, PsVectorType):
-                    raise TypificationError(
-                        f"Illegal type in argument to vector horizontal: {op_tc.target_type}"
-                    )
-
-                tc.apply_dtype(op_tc.target_type.scalar_type, expr)
-
             case _:
                 raise NotImplementedError(f"Can't typify {expr}")
 
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index 59c3a178f..ee14d1689 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Sequence
+from typing import Sequence, Tuple
 from enum import Enum
 from functools import cache
 
@@ -132,6 +132,8 @@ class X86VectorCpu(GenericVectorCpu):
         else:
             headers = {"<immintrin.h>"}
 
+        headers.update({'"simd_horizontal_helpers.h"'})
+
         return super().required_headers | headers
 
     def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType:
@@ -162,8 +164,8 @@ class X86VectorCpu(GenericVectorCpu):
             case PsUnOp() | PsBinOp():
                 vtype: PsType
                 if isinstance(expr, PsVecHorizontal):
-                    # expression itself is scalar, but argument is a vector
-                    vtype = expr.operand.get_dtype()
+                    # return type of expression itself is scalar, but input argument to intrinsic is a vector
+                    vtype = expr.vector_operand.get_dtype()
                 else:
                     vtype = expr.get_dtype()
 
@@ -346,6 +348,7 @@ def _x86_op_intrin(
     prefix = varch.intrin_prefix(vtype)
     suffix = varch.intrin_suffix(vtype)
     rtype = atype = varch.intrin_type(vtype)
+    atypes: Tuple[PsType, ...] = ()
 
     match op:
         case PsVecBroadcast():
@@ -356,6 +359,7 @@ def _x86_op_intrin(
         case PsVecHorizontal():
             opstr = f"horizontal_{op.reduction_op.name.lower()}"
             rtype = vtype.scalar_type
+            atypes = (vtype.scalar_type, vtype)
         case PsAdd():
             opstr = "add"
         case PsSub():
@@ -418,4 +422,7 @@ def _x86_op_intrin(
             )
 
     num_args = 1 if isinstance(op, PsUnOp) else 2
-    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
+    if not atypes:
+        atypes = (atype,) * num_args
+
+    return CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype)
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index 39d72adb4..ab28507c2 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -147,7 +147,8 @@ class LoopVectorizer:
 
             # Write back vectorization result
             simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment(
-                PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(vector_symb), reduction_info.op)))]
+                PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb),
+                                                    reduction_info.op)))]
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py
index 7a03e293a..49fb9bb08 100644
--- a/src/pystencils/backend/transformations/select_intrinsics.py
+++ b/src/pystencils/backend/transformations/select_intrinsics.py
@@ -99,9 +99,9 @@ class SelectIntrinsics:
         if not isinstance(expr.dtype, PsVectorType):
             # special case: result type of horizontal reduction is scalar
             if isinstance(expr, PsVecHorizontal):
-                op = self.visit_expr(expr.operand, sc)
-                print(op)
-                return self._platform.op_intrinsic(expr, [op])
+                scalar_op = expr.scalar_operand
+                vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc)
+                return self._platform.op_intrinsic(expr, [scalar_op, vector_op_to_scalar])
             else:
                 return expr
 
diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h
new file mode 100644
index 000000000..6a80f2107
--- /dev/null
+++ b/src/pystencils/include/simd_horizontal_helpers.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <immintrin.h>
+
+#define QUALIFIERS inline
+
+QUALIFIERS double _mm256_horizontal_add_pd(double a, __m256d b) {
+    __m256d _v = b;
+    __m256d _h = _mm256_hadd_pd(_v,_v);
+    return a + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h)));
+}
\ No newline at end of file
-- 
GitLab


From 7306f4ddfbb9066494480f14cdb211ae657f79ba Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Feb 2025 15:41:34 +0100
Subject: [PATCH 103/180] Minor fix

---
 src/pystencils/backend/platforms/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index f9fbdfa56..90efebe61 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -161,7 +161,7 @@ class CudaPlatform(GenericGpu):
                 return PsDeclaration(symbol_expr, init_val)
             case ReductionFunctions.WriteBackToPtr:
                 ptr_expr, symbol_expr = call.args
-                op = call.function.op
+                op = call.function.reduction_op
 
                 assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
                 assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
-- 
GitLab


From 58cdb79206931dfe15a4fdb7f656d8d047a2a6b8 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 15:30:42 +0100
Subject: [PATCH 104/180] Fix bug with doubly inverted sign for subtraction
 reductions

---
 src/pystencils/backend/platforms/cuda.py        | 5 +++--
 src/pystencils/backend/platforms/generic_cpu.py | 6 +++++-
 src/pystencils/backend/platforms/x86.py         | 5 ++++-
 tests/kernelcreation/test_reduction.py          | 2 +-
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 90efebe61..9877cea44 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -168,10 +168,11 @@ class CudaPlatform(GenericGpu):
 
                 match op:
                     case ReductionOp.Sub:
-                        # workaround for unsupported atomicSub: use atomic add and invert sign
+                        # workaround for unsupported atomicSub: use atomic add
+                        # similar to OpenMP reductions: local copies (negative sign) are added at the end
                         call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
                                           PsCustomType("void"))
-                        call.args = (ptr_expr, -symbol_expr)
+                        call.args = (ptr_expr, symbol_expr)
                     case _:
                         call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
                                                   PsCustomType("void"))
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 7655572de..1e7468e33 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -8,6 +8,7 @@ from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsF
     PsReductionFunction
 from ..literals import PsLiteral
 from ...compound_op_mapping import compound_op_to_expr
+from ...sympyextensions import ReductionOp
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType
 
 from .platform import Platform
@@ -81,8 +82,11 @@ class GenericCpu(Platform):
 
                 ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
+                # inspired by OpenMP: local reduction variable (negative sign) is added at the end
+                actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
+
                 # TODO: can this be avoided somehow?
-                potential_call = compound_op_to_expr(op, ptr_access, symbol_expr)
+                potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
                 if isinstance(potential_call, PsCall):
                     potential_call.dtype = symbol_expr.dtype
                     potential_call = self.select_function(potential_call)
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index ee14d1689..02b5ea6db 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -18,6 +18,7 @@ from ..ast.expressions import (
     PsCall,
 )
 from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
+from ...sympyextensions import ReductionOp
 from ...types import PsCustomType, PsVectorType, PsPointerType, PsType
 from ..constants import PsConstant
 
@@ -357,7 +358,9 @@ def _x86_op_intrin(
                 suffix += "x"
             atype = vtype.scalar_type
         case PsVecHorizontal():
-            opstr = f"horizontal_{op.reduction_op.name.lower()}"
+            # horizontal add instead of sub avoids double inversion of sign
+            actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op
+            opstr = f"horizontal_{actual_op.name.lower()}"
             rtype = vtype.scalar_type
             atypes = (vtype.scalar_type, vtype)
         case PsAdd():
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 12dc4ba1c..537eb4b67 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -10,7 +10,7 @@ INIT_ARR = 2
 SIZE = 15
 SOLUTION = {
     "+": INIT_W + INIT_ARR * SIZE,
-    "-": INIT_W - INIT_ARR * -SIZE,
+    "-": INIT_W - INIT_ARR * SIZE,
     "*": INIT_W * INIT_ARR ** SIZE,
     "min": min(INIT_W, INIT_ARR),
     "max": max(INIT_W, INIT_ARR),
-- 
GitLab


From fe3cd6cd19f2c11166e2b8f670a0a8a337b898ef Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 16:02:05 +0100
Subject: [PATCH 105/180] Add generator for SIMD horizontal operations and the
 emitted code.

---
 .../include/simd_horizontal_helpers.h         | 232 ++++++++++++-
 util/generate_simd_horizontal_op.py           | 309 ++++++++++++++++++
 2 files changed, 535 insertions(+), 6 deletions(-)
 create mode 100644 util/generate_simd_horizontal_op.py

diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h
index 6a80f2107..cd4bd5730 100644
--- a/src/pystencils/include/simd_horizontal_helpers.h
+++ b/src/pystencils/include/simd_horizontal_helpers.h
@@ -1,11 +1,231 @@
 #pragma once
 
+#include <cmath>
+
+#if defined(__SSE3__)
+#include <immintrin.h>
+
+inline double _mm_horizontal_add_pd(double dst, __m128d src) { 
+	__m128d _v = src;
+	return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v));
+}
+
+inline float _mm_horizontal_add_ps(float dst, __m128 src) { 
+	__m128 _v = src;
+	__m128 _h = _mm_hadd_ps(_v, _v);
+	return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h)));
+}
+
+inline double _mm_horizontal_mul_pd(double dst, __m128d src) { 
+	__m128d _v = src;
+	double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1)));
+	return dst * _r;
+}
+
+inline float _mm_horizontal_mul_ps(float dst, __m128 src) { 
+	__m128 _v = src;
+	__m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177));
+	float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return dst * _r;
+}
+
+inline double _mm_horizontal_min_pd(double dst, __m128d src) { 
+	__m128d _v = src;
+	double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1)));
+	return fmin(_r, dst);
+}
+
+inline float _mm_horizontal_min_ps(float dst, __m128 src) { 
+	__m128 _v = src;
+	__m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177));
+	float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return fmin(_r, dst);
+}
+
+inline double _mm_horizontal_max_pd(double dst, __m128d src) { 
+	__m128d _v = src;
+	double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1)));
+	return fmax(_r, dst);
+}
+
+inline float _mm_horizontal_max_ps(float dst, __m128 src) { 
+	__m128 _v = src;
+	__m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177));
+	float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(__AVX__)
+#include <immintrin.h>
+
+inline double _mm256_horizontal_add_pd(double dst, __m256d src) { 
+	__m256d _v = src;
+	__m256d _h = _mm256_hadd_pd(_v, _v);
+	return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h)));
+}
+
+inline float _mm256_horizontal_add_ps(float dst, __m256 src) { 
+	__m256 _v = src;
+	__m256 _h = _mm256_hadd_ps(_v, _v);
+	__m128  _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h));
+	return dst + _mm_cvtss_f32(_mm_hadd_ps(_i,_i));
+}
+
+inline double _mm256_horizontal_mul_pd(double dst, __m256d src) { 
+	__m256d _v = src;
+	__m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+	double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1))); 
+	return dst * _r;
+}
+
+inline float _mm256_horizontal_mul_ps(float dst, __m256 src) { 
+	__m256 _v = src;
+	__m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+	__m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+	float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return dst * _r;
+}
+
+inline double _mm256_horizontal_min_pd(double dst, __m256d src) { 
+	__m256d _v = src;
+	__m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+	double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1))); 
+	return fmin(_r, dst);
+}
+
+inline float _mm256_horizontal_min_ps(float dst, __m256 src) { 
+	__m256 _v = src;
+	__m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+	__m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+	float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return fmin(_r, dst);
+}
+
+inline double _mm256_horizontal_max_pd(double dst, __m256d src) { 
+	__m256d _v = src;
+	__m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+	double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1))); 
+	return fmax(_r, dst);
+}
+
+inline float _mm256_horizontal_max_ps(float dst, __m256 src) { 
+	__m256 _v = src;
+	__m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+	__m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+	float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+	return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(__AVX512VL__)
 #include <immintrin.h>
 
-#define QUALIFIERS inline
+inline double _mm512_horizontal_add_pd(double dst, __m512d src) { 
+	double _r = _mm512_reduce_add_pd(src);
+	return dst + _r;
+}
+
+inline float _mm512_horizontal_add_ps(float dst, __m512 src) { 
+	float _r = _mm512_reduce_add_ps(src);
+	return dst + _r;
+}
+
+inline double _mm512_horizontal_mul_pd(double dst, __m512d src) { 
+	double _r = _mm512_reduce_mul_pd(src);
+	return dst * _r;
+}
+
+inline float _mm512_horizontal_mul_ps(float dst, __m512 src) { 
+	float _r = _mm512_reduce_mul_ps(src);
+	return dst * _r;
+}
+
+inline double _mm512_horizontal_min_pd(double dst, __m512d src) { 
+	double _r = _mm512_reduce_min_pd(src);
+	return fmin(_r, dst);
+}
+
+inline float _mm512_horizontal_min_ps(float dst, __m512 src) { 
+	float _r = _mm512_reduce_min_ps(src);
+	return fmin(_r, dst);
+}
+
+inline double _mm512_horizontal_max_pd(double dst, __m512d src) { 
+	double _r = _mm512_reduce_max_pd(src);
+	return fmax(_r, dst);
+}
+
+inline float _mm512_horizontal_max_ps(float dst, __m512 src) { 
+	float _r = _mm512_reduce_max_ps(src);
+	return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(_M_ARM64)
+#include <arm_neon.h>
+
+inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) { 
+	float64x2_t _v = src;
+	double _r = vgetq_lane_f64(_v,0);
+	_r += vgetq_lane_f64(_v,1);
+	return dst + _r;
+}
+
+inline float vget_horizontal_add_f32(float dst, float32x4_t src) { 
+	float32x4_t _v = src;
+	float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v));
+	float _r = vget_lane_f32(_w,0);
+	_r += vget_lane_f32(_w,1);
+	return dst + _r;
+}
+
+inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) { 
+	float64x2_t _v = src;
+	double _r = vgetq_lane_f64(_v,0);
+	_r *= vgetq_lane_f64(_v,1);
+	return dst * _r;
+}
+
+inline float vget_horizontal_mul_f32(float dst, float32x4_t src) { 
+	float32x4_t _v = src;
+	float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v));
+	float _r = vget_lane_f32(_w,0);
+	_r *= vget_lane_f32(_w,1);
+	return dst * _r;
+}
+
+inline double vgetq_horizontal_min_f64(double dst, float64x2_t src) { 
+	float64x2_t _v = src;
+	double _r = vgetq_lane_f64(_v,0);
+	_r = fmin(_r, vgetq_lane_f64(_v,1));
+	return fmin(_r, dst);
+}
+
+inline float vget_horizontal_min_f32(float dst, float32x4_t src) { 
+	float32x4_t _v = src;
+	float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v));
+	float _r = vget_lane_f32(_w,0);
+	_r = fmin(_r, vget_lane_f32(_w,1));
+	return fmin(_r, dst);
+}
+
+inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) { 
+	float64x2_t _v = src;
+	double _r = vgetq_lane_f64(_v,0);
+	_r = fmax(_r, vgetq_lane_f64(_v,1));
+	return fmax(_r, dst);
+}
+
+inline float vget_horizontal_max_f32(float dst, float32x4_t src) { 
+	float32x4_t _v = src;
+	float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v));
+	float _r = vget_lane_f32(_w,0);
+	_r = fmax(_r, vget_lane_f32(_w,1));
+	return fmax(_r, dst);
+}
 
-QUALIFIERS double _mm256_horizontal_add_pd(double a, __m256d b) {
-    __m256d _v = b;
-    __m256d _h = _mm256_hadd_pd(_v,_v);
-    return a + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h)));
-}
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
new file mode 100644
index 000000000..aebbf35bb
--- /dev/null
+++ b/util/generate_simd_horizontal_op.py
@@ -0,0 +1,309 @@
+from enum import Enum
+
+FCT_QUALIFIERS = "inline"
+
+
+class InstructionSets(Enum):
+    SSE3 = "SSE3"
+    AVX = "AVX"
+    AVX512 = "AVX512"
+    NEON = "NEON"
+
+    def __str__(self):
+        return self.value
+
+
+class ReductionOps(Enum):
+    Add = ("add", "+")
+    Mul = ("mul", "*")
+    Min = ("min", "min")
+    Max = ("max", "max")
+
+    def __init__(self, op_name, op_str):
+        self.op_name = op_name
+        self.op_str = op_str
+
+
+class ScalarTypes(Enum):
+    Double = "double"
+    Float = "float"
+
+    def __str__(self):
+        return self.value
+
+
+class VectorTypes(Enum):
+    SSE3_128d = "__m128d"
+    SSE3_128 = "__m128"
+
+    AVX_256d = "__m256d"
+    AVX_256 = "__m256"
+    AVX_128 = "__m128"
+
+    AVX_512d = "__m512d"
+    AVX_512 = "__m512"
+
+    NEON_64x2 = "float64x2_t"
+    NEON_32x4 = "float32x4_t"
+
+    def __str__(self):
+        return self.value
+
+
+class Variable:
+    def __init__(self, name: str, dtype: ScalarTypes | VectorTypes):
+        self._name = name
+        self._dtype = dtype
+
+    def __str__(self):
+        return f"{self._dtype} {self._name}"
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def dtype(self) -> ScalarTypes | VectorTypes:
+        return self._dtype
+
+
+def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets:
+    match vtype:
+        case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d:
+            return InstructionSets.SSE3
+        case VectorTypes.AVX_256 | VectorTypes.AVX_256d:
+            return InstructionSets.AVX
+        case VectorTypes.AVX_512 | VectorTypes.AVX_512d:
+            return InstructionSets.AVX512
+        case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2:
+            return InstructionSets.NEON
+
+
+def intrin_prefix(instruction_set: InstructionSets, double_prec: bool):
+    match instruction_set:
+        case InstructionSets.SSE3:
+            return "_mm"
+        case InstructionSets.AVX:
+            return "_mm256"
+        case InstructionSets.AVX512:
+            return "_mm512"
+        case InstructionSets.NEON:
+            return "vgetq" if double_prec else "vget"
+        case _:
+            raise ValueError(f"Unknown instruction set {instruction_set}")
+
+
+def intrin_suffix(instruction_set: InstructionSets, double_prec: bool):
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        return "pd" if double_prec else "ps"
+    elif instruction_set in [InstructionSets.NEON]:
+        return "f64" if double_prec else "f32"
+    else:
+        raise ValueError(f"Unknown instruction set {instruction_set}")
+
+
+def generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str):
+    return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})"
+
+
+def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset):
+    return f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})"
+
+
+def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str):
+    return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})"
+
+
+def generate_cvts_intrin(double_prec: bool, v: str):
+    convert_suffix = "f64" if double_prec else "f32"
+    intrin_suffix = "d" if double_prec else "s"
+    return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})"
+
+
+def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps):
+    prefix = intrin_prefix(instruction_set, double_prec)
+    suffix = intrin_suffix(instruction_set, double_prec)
+    return f"{prefix}_horizontal_{op.op_name}_{suffix}"
+
+
+def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable):
+    double_prec = svar.dtype is ScalarTypes.Double
+    return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n"
+
+
+# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling
+def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable):
+    reduction_op = ReductionOps.Add
+    instruction_set = get_intrin_from_vector_type(vector_var.dtype)
+    double_prec = scalar_var.dtype is ScalarTypes.Double
+
+    sname = scalar_var.name
+    vtype = vector_var.dtype
+    vname = vector_var.name
+
+    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
+    hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var)
+    cvts = lambda var: generate_cvts_intrin(double_prec, var)
+
+    # function body
+    body = f"\t{vtype} _v = {vname};\n"
+    match instruction_set:
+        case InstructionSets.SSE3:
+            if double_prec:
+                body += f"\treturn {sname} + {cvts(hadd('_v'))};\n"
+            else:
+                body += f"\t{vtype} _h = {hadd('_v')};\n" \
+                        f"\treturn {sname} + {cvts(simd_op('_h', '_mm_movehdup_ps(_h)'))};\n"
+
+        case InstructionSets.AVX:
+            if double_prec:
+                body += f"\t{vtype} _h = {hadd('_v')};\n" \
+                        f"\treturn {sname} + {cvts(simd_op('_mm256_extractf128_pd(_h,1)', '_mm256_castpd256_pd128(_h)'))};\n"
+            else:
+                add_i = "_mm_hadd_ps(_i,_i)"
+                body += f"\t{vtype} _h = {hadd('_v')};\n" \
+                        f"\t__m128  _i = {simd_op('_mm256_extractf128_ps(_h,1)', '_mm256_castps256_ps128(_h)')};\n" \
+                        f"\treturn {sname} + {cvts(add_i)};\n"
+
+        case _:
+            raise ValueError(f"No specialized version of horizontal_add available for {instruction_set}")
+
+    # function decl
+    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)
+
+    return decl + body + "}\n"
+
+
+def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable):
+    instruction_set = get_intrin_from_vector_type(vector_var.dtype)
+    double_prec = scalar_var.dtype is ScalarTypes.Double
+
+    # generate specialized version for add operation
+    if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]:
+        return generate_simd_horizontal_add(scalar_var, vector_var)
+
+    sname = scalar_var.name
+    stype = scalar_var.dtype
+    vtype = vector_var.dtype
+    vname = vector_var.name
+
+    opname = reduction_op.op_name
+    opstr = reduction_op.op_str
+
+    reduction_function = f"f{opname}" \
+        if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None
+
+    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
+    cvts = lambda var: generate_cvts_intrin(double_prec, var)
+    shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset)
+
+    # function body
+    body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else ""
+    match instruction_set:
+        case InstructionSets.SSE3:
+            if double_prec:
+                body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n"
+            else:
+                body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX:
+            if double_prec:
+                body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))}; \n"
+            else:
+                body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \
+                        f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX512:
+            suffix = intrin_suffix(instruction_set, double_prec)
+            body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n"
+
+        case InstructionSets.NEON:
+            if double_prec:
+                body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n"
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n"
+            else:
+                body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \
+                        f"\t{stype} _r = vget_lane_f32(_w,0);\n"  # _w is a float32x2_t -> non-q lane getter
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n"
+
+        case _:
+            raise ValueError(f"Unsupported instruction set {instruction_set}")
+
+    # finalize reduction
+    if reduction_function:
+        body += f"\treturn {reduction_function}(_r, {sname});\n"
+    else:
+        body += f"\treturn {sname} {opstr} _r;\n"
+
+    # function decl
+    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)
+
+    return decl + body + "}\n"
+
+
+stypes = {
+    True: ScalarTypes.Double,
+    False: ScalarTypes.Float
+}
+
+vtypes_for_instruction_set = {
+    InstructionSets.SSE3: {
+        True: VectorTypes.SSE3_128d,
+        False: VectorTypes.SSE3_128
+    },
+    InstructionSets.AVX: {
+        True: VectorTypes.AVX_256d,
+        False: VectorTypes.AVX_256
+    },
+    InstructionSets.AVX512: {
+        True: VectorTypes.AVX_512d,
+        False: VectorTypes.AVX_512
+    },
+    InstructionSets.NEON: {
+        True: VectorTypes.NEON_64x2,
+        False: VectorTypes.NEON_32x4
+    },
+}
+
+guards_for_instruction_sets = {
+    InstructionSets.SSE3: "__SSE3__",
+    InstructionSets.AVX: "__AVX__",
+    InstructionSets.AVX512: '__AVX512VL__',
+    InstructionSets.NEON: '_M_ARM64',
+}
+
+code = """#pragma once
+
+#include <cmath>
+
+"""
+
+for instruction_set in InstructionSets:
+    code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n"
+
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        code += "#include <immintrin.h>\n\n"
+    elif instruction_set == InstructionSets.NEON:
+        code += "#include <arm_neon.h>\n\n"
+    else:
+        raise ValueError(f"Missing header include for instruction set {instruction_set}")
+
+    for reduction_op in ReductionOps:
+        for double_prec in [True, False]:
+            scalar_var = Variable("dst", stypes[double_prec])
+            vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec])
+
+            code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n"
+
+    code += "#endif\n\n"
+
+print(code)
-- 
GitLab


From 71881893e551cfa3c5ce2217b8fad0ef751e3613 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 17:16:09 +0100
Subject: [PATCH 106/180] Split reduction test into separate CPU/GPU tests

---
 tests/kernelcreation/test_reduction.py | 52 ++++++++++++++++----------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 537eb4b67..9fd385e02 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -17,38 +17,50 @@ SOLUTION = {
 }
 
 
-@pytest.mark.parametrize('dtype', ["float64"])
-@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
-def test_reduction(target, dtype, op):
-    gpu_avail = target is ps.Target.GPU
-
+# get AST for kernel with reduction assignment
+def get_reduction_assign_ast(dtype, op, config):
     x = ps.fields(f'x: {dtype}[1d]')
     w = ps.TypedSymbol("w", dtype)
 
-    # kernel with reduction assignment
-
     red_assign = reduction_assignment_from_str(w, op, x.center())
 
-    vectorize_info = {'instruction_set': 'avx', 'assume_inner_stride_one': True}
+    return ps.create_kernel([red_assign], config, default_dtype=dtype)
+
 
-    config = ps.CreateKernelConfig(target=target) if gpu_avail \
-        else ps.CreateKernelConfig(target=target, cpu_openmp=True, cpu_vectorize_info=vectorize_info)
+@pytest.mark.parametrize('instruction_set', ['sse', 'avx'])
+@pytest.mark.parametrize('dtype', ["float64", "float32"])
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+def test_reduction_cpu(instruction_set, dtype, op):
+
+    vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True}
 
-    ast_reduction = ps.create_kernel([red_assign], config, default_dtype=dtype)
+    config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info)
+
+    ast_reduction = get_reduction_assign_ast(dtype, op, config)
     ps.show_code(ast_reduction)
+    kernel_reduction = ast_reduction.compile()
 
-    # code_reduction = ps.get_code_str(ast_reduction)
+    array = np.full((SIZE,), INIT_ARR, dtype=dtype)
+    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+
+    kernel_reduction(x=array, w=reduction_array)
+    assert np.allclose(reduction_array, SOLUTION[op])
+
+
+@pytest.mark.parametrize('dtype', ["float64", "float32"])
+@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+def test_reduction_gpu(dtype, op):
+    config = ps.CreateKernelConfig(target=ps.Target.GPU)
+
+    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ps.show_code(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
     array = np.full((SIZE,), INIT_ARR, dtype=dtype)
     reduction_array = np.full((1,), INIT_W, dtype=dtype)
 
-    if gpu_avail:
-        array_gpu = cp.asarray(array)
-        reduction_array_gpu = cp.asarray(reduction_array)
+    array_gpu = cp.asarray(array)
+    reduction_array_gpu = cp.asarray(reduction_array)
 
-        kernel_reduction(x=array_gpu, w=reduction_array_gpu)
-        assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
-    else:
-        kernel_reduction(x=array, w=reduction_array)
-        assert np.allclose(reduction_array, SOLUTION[op])
+    kernel_reduction(x=array_gpu, w=reduction_array_gpu)
+    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
-- 
GitLab


From 13569a616ef3eb6a28a698e8163f0748d4a4c0c0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 17:31:23 +0100
Subject: [PATCH 107/180] Fix lint

---
 src/pystencils/backend/ast/vector.py           |  6 ++----
 src/pystencils/backend/platforms/cuda.py       |  6 +++---
 .../backend/transformations/loop_vectorizer.py | 18 +++++++++---------
 src/pystencils/compound_op_mapping.py          |  2 --
 src/pystencils/jit/gpu_cupy.py                 |  3 +--
 src/pystencils/sympyextensions/reduction.py    | 10 +++++++---
 6 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 14249e1e8..5121987a8 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -92,11 +92,9 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecHorizontal):
             return False
-        return (
-                super().structurally_equal(other)
+        return (super().structurally_equal(other)
                 and self._lanes == other._lanes
-                and self._reduction_op == other._reduction_op
-        )
+                and self._reduction_op == other._reduction_op)
 
 
 class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 9877cea44..e8c8f6a3a 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -89,7 +89,7 @@ class CudaPlatform(GenericGpu):
 
         if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
             assert isinstance(dtype, PsIeeeFloatType)
-            defines = { NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY" }
+            defines = {NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY"}
 
             return PsLiteralExpr(PsLiteral(defines[func], dtype))
 
@@ -170,8 +170,8 @@ class CudaPlatform(GenericGpu):
                     case ReductionOp.Sub:
                         # workaround for unsupported atomicSub: use atomic add
                         # similar to OpenMP reductions: local copies (negative sign) are added at the end
-                        call.function = CFunction(f"atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
-                                          PsCustomType("void"))
+                        call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
+                                                  PsCustomType("void"))
                         call.args = (ptr_expr, symbol_expr)
                     case _:
                         call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index ab28507c2..b78114553 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -240,14 +240,14 @@ class LoopVectorizer:
                 )
 
                 return PsBlock(
-                    simd_init_local_reduction_vars +
-                    [
+                    simd_init_local_reduction_vars
+                    + [
                         simd_stop_decl,
                         simd_step_decl,
                         simd_loop
-                    ] +
-                    simd_writeback_local_reduction_vars +
-                    [
+                    ]
+                    + simd_writeback_local_reduction_vars
+                    + [
                         trailing_start_decl,
                         trailing_loop,
                     ]
@@ -258,13 +258,13 @@ class LoopVectorizer:
 
             case LoopVectorizer.TrailingItersTreatment.NONE:
                 return PsBlock(
-                    simd_init_local_reduction_vars +
-                    [
+                    simd_init_local_reduction_vars
+                    + [
                         simd_stop_decl,
                         simd_step_decl,
                         simd_loop,
-                    ] +
-                    simd_writeback_local_reduction_vars
+                    ]
+                    + simd_writeback_local_reduction_vars
                 )
 
     @overload
diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py
index 2dd88fc94..f256369f9 100644
--- a/src/pystencils/compound_op_mapping.py
+++ b/src/pystencils/compound_op_mapping.py
@@ -1,5 +1,3 @@
-from operator import truediv, mul, sub, add
-
 from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv
 from .backend.exceptions import FreezeError
 from .backend.functions import PsMathFunction, MathFunctions
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 331b58ce5..0792b6c01 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -11,7 +11,6 @@ except ImportError:
 from ..codegen import Target
 from ..field import FieldType
 
-from ..types import PsType, PsPointerType
 from .jit import JitBase, JitError, KernelWrapper
 from ..codegen import (
     Kernel,
@@ -19,7 +18,7 @@ from ..codegen import (
     Parameter,
 )
 from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr
-from ..types import PsStructType, PsPointerType
+from ..types import PsType, PsStructType, PsPointerType
 
 from ..include import get_pystencils_include_path
 
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 25ae5c0ac..cebfcb2f7 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -19,14 +19,18 @@ class ReductionAssignment(AssignmentBase):
     Attributes:
     ===========
 
-    binop : CompoundOp
+    reduction_op : ReductionOp
        Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
     """
-    reduction_op = None  # type: ReductionOp
+    _reduction_op = None  # type: ReductionOp
 
     @property
     def reduction_op(self):
-        return self.reduction_op
+        return self._reduction_op
+
+    @reduction_op.setter
+    def reduction_op(self, op):
+        self._reduction_op = op
 
 
 class AddReductionAssignment(ReductionAssignment):
-- 
GitLab


From d14898373c0503bd9bbde0d3c0ee35888519f11f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 17:54:40 +0100
Subject: [PATCH 108/180] Fix typecheck

---
 src/pystencils/backend/ast/vector.py       |  2 +-
 src/pystencils/backend/platforms/cuda.py   | 13 ++++++++++---
 src/pystencils/jit/cpu_extension_module.py |  6 +++++-
 src/pystencils/jit/gpu_cupy.py             |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 5121987a8..4f5224133 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -46,7 +46,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
 class PsVecHorizontal(PsBinOp, PsVectorOp):
     """Extracts scalar value from N vector lanes."""
 
-    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "operation")
+    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
 
     def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression,
                  reduction_op: ReductionOp):
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index e8c8f6a3a..12a18b41b 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -87,11 +87,18 @@ class CudaPlatform(GenericGpu):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
+        if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions:
             assert isinstance(dtype, PsIeeeFloatType)
-            defines = {NumericLimitsFunctions.Min: "NEG_INFINITY", NumericLimitsFunctions.Max: "POS_INFINITY"}
 
-            return PsLiteralExpr(PsLiteral(defines[func], dtype))
+            match func:
+                case NumericLimitsFunctions.Min:
+                    define = "NEG_INFINITY"
+                case NumericLimitsFunctions.Max:
+                    define = "POS_INFINITY"
+                case _:
+                    raise MaterializationError(f"Cannot materialize call to function {func}")
+
+            return PsLiteralExpr(PsLiteral(define, dtype))
 
         if isinstance(dtype, PsIeeeFloatType):
             match func:
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index bdf99b7ad..03260f649 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -286,7 +286,11 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
     def extract_ptr(self, param: Parameter) -> str:
         if param not in self._pointer_extractions:
             ptr = param.symbol
-            self._buffer_types[ptr] = ptr.dtype.base_type
+            ptr_dtype = ptr.dtype
+
+            assert isinstance(ptr_dtype, PsPointerType)
+
+            self._buffer_types[ptr] = ptr_dtype.base_type
             self.extract_buffer(ptr, param.name)
             buffer = self.get_buffer(param.name)
             code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py
index 0792b6c01..6b0ccf02f 100644
--- a/src/pystencils/jit/gpu_cupy.py
+++ b/src/pystencils/jit/gpu_cupy.py
@@ -197,7 +197,7 @@ class CupyKernelWrapper(KernelWrapper):
                 args.append(val)
             else:
                 #   scalar parameter
-                val: Any = kwargs[kparam.name]
+                val = kwargs[kparam.name]
                 add_arg(kparam.name, val, kparam.dtype)
 
         #   Determine launch grid
-- 
GitLab


From ef185b4e48f0c0ca40aaba84c928289995537f45 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 18:18:42 +0100
Subject: [PATCH 109/180] Fix ImportError for cupy

---
 tests/kernelcreation/test_reduction.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 9fd385e02..1824ea095 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -1,6 +1,10 @@
 import pytest
 import numpy as np
-import cupy as cp
+
+try:
+    import cupy as cp
+except ImportError:
+    pass
 
 import pystencils as ps
 from pystencils.sympyextensions import reduction_assignment_from_str
-- 
GitLab


From 0c40ed634587f5854d11f7851c7ba9eb5911f4ff Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 17 Feb 2025 18:31:41 +0100
Subject: [PATCH 110/180] Use import or skip mechanism for cupy

---
 tests/kernelcreation/test_reduction.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 1824ea095..ec23297b0 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -1,11 +1,6 @@
 import pytest
 import numpy as np
 
-try:
-    import cupy as cp
-except ImportError:
-    pass
-
 import pystencils as ps
 from pystencils.sympyextensions import reduction_assignment_from_str
 
@@ -35,7 +30,6 @@ def get_reduction_assign_ast(dtype, op, config):
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction_cpu(instruction_set, dtype, op):
-
     vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True}
 
     config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info)
@@ -54,6 +48,9 @@ def test_reduction_cpu(instruction_set, dtype, op):
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction_gpu(dtype, op):
+    pytest.importorskip('cupy')
+    import cupy as cp
+
     config = ps.CreateKernelConfig(target=ps.Target.GPU)
 
     ast_reduction = get_reduction_assign_ast(dtype, op, config)
-- 
GitLab


From 77a2226818946a666f98b23e95a76c16b77c1a82 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Feb 2025 12:57:49 +0100
Subject: [PATCH 111/180] Avoid duplicate definition of atomicMin/Max for HIP

---
 src/pystencils/include/gpu_defines.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/include/gpu_defines.h b/src/pystencils/include/gpu_defines.h
index 5525bbc69..34cff79de 100644
--- a/src/pystencils/include/gpu_defines.h
+++ b/src/pystencils/include/gpu_defines.h
@@ -13,10 +13,11 @@ typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
 
-#ifdef __CUDA_ARCH__
-// No direct implementation of atomic multiplication, minimum and maximum available
+// No direct implementation for all atomic operations available
 // -> add support by custom implementations using a CAS mechanism
 
+#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__)
+
 // - atomicMul (double/float)
 //   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
 __device__ double atomicMul(double* address, double val) {
@@ -43,6 +44,10 @@ __device__ float atomicMul(float* address, float val) {
     return __int_as_float(old);
 }
 
+#endif
+
+#ifdef __CUDA_ARCH__
+
 // - atomicMin (double/float)
 //   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
 __device__ __forceinline__ double atomicMin(double *address, double val)
@@ -94,4 +99,5 @@ __device__ __forceinline__ float atomicMax(float *address, float val)
     }
     return __int_as_float(ret);
 }
+
 #endif
-- 
GitLab


From d10c65d43919f5ee611fc4a0ef7c79f3c3e65822 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Tue, 18 Feb 2025 17:29:22 +0100
Subject: [PATCH 112/180] Catch CUDARuntimeError for missing CUDA capable device
 in reduction GPU test

---
 tests/kernelcreation/test_reduction.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index ec23297b0..992c328d7 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -2,6 +2,7 @@ import pytest
 import numpy as np
 
 import pystencils as ps
+from cupy_backends.cuda.api.runtime import CUDARuntimeError
 from pystencils.sympyextensions import reduction_assignment_from_str
 
 INIT_W = 5
@@ -48,8 +49,15 @@ def test_reduction_cpu(instruction_set, dtype, op):
 @pytest.mark.parametrize('dtype', ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction_gpu(dtype, op):
-    pytest.importorskip('cupy')
-    import cupy as cp
+    try:
+        import cupy as cp
+
+        device_count = range(cp.cuda.runtime.getDeviceCount())
+        print(f"Found {device_count} GPUs")
+    except ImportError:
+        pytest.skip(reason="CuPy is not available", allow_module_level=True)
+    except CUDARuntimeError:
+        pytest.skip(reason="No CUDA capable device is detected", allow_module_level=True)
 
     config = ps.CreateKernelConfig(target=ps.Target.GPU)
 
-- 
GitLab


From ce816539159408b26b09b4bf6e1df0cbff437829 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 19 Feb 2025 16:33:25 +0100
Subject: [PATCH 113/180] Encapsulate fetching of kernel conditions for
 iteration spaces in separate function

---
 src/pystencils/backend/platforms/cuda.py | 55 ++++++++++++++++++------
 1 file changed, 42 insertions(+), 13 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index fb613347a..eff88df7e 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -315,13 +315,47 @@ class CudaPlatform(GenericGpu):
 
     #   Internals
 
+    # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU?
+
+    def _get_condition_for_translation(
+            self, ispace: IterationSpace):
+
+        if not self._omit_range_check:
+            return None
+
+        match ispace:
+            case FullIterationSpace():
+
+                dimensions = ispace.dimensions_in_loop_order()
+
+                conds = []
+                for dim in dimensions:
+                    ctr_expr = PsExpression.make(dim.counter)
+                    conds.append(PsLt(ctr_expr, dim.stop))
+
+                    if conds:
+                        condition: PsExpression = conds[0]
+                        for cond in conds[1:]:
+                            condition = PsAnd(condition, cond)
+                        return condition
+                    else:
+                        return None
+
+            case SparseIterationSpace():
+                sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+                stop = PsExpression.make(ispace.index_list.shape[0])
+
+                return PsLt(sparse_ctr_expr.clone(), stop)
+            case _:
+                assert False, "Unknown iteration space"
+
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> PsBlock:
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
-        conds = []
+        cond = self._get_condition_for_translation(ispace)
 
         dimensions = ispace.dimensions_in_loop_order()
 
@@ -335,14 +369,9 @@ class CudaPlatform(GenericGpu):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            if not self._omit_range_check:
-                conds.append(PsLt(ctr_expr, dim.stop))
-
-        if conds:
-            condition: PsExpression = conds[0]
-            for cond in conds[1:]:
-                condition = PsAnd(condition, cond)
-            ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
+
+        if cond:
+            ast = PsBlock(indexing_decls + [PsConditional(cond, body)])
         else:
             body.statements = indexing_decls + body.statements
             ast = body
@@ -355,6 +384,8 @@ class CudaPlatform(GenericGpu):
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
+        cond = self._get_condition_for_translation(ispace)
+
         sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
         ctr_mapping = self._thread_mapping(ispace)
 
@@ -377,10 +408,8 @@ class CudaPlatform(GenericGpu):
         ]
         body.statements = mappings + body.statements
 
-        if not self._omit_range_check:
-            stop = PsExpression.make(ispace.index_list.shape[0])
-            condition = PsLt(sparse_ctr_expr.clone(), stop)
-            ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
+        if cond:
+            ast = PsBlock([sparse_idx_decl, PsConditional(cond, body)])
         else:
             body.statements = [sparse_idx_decl] + body.statements
             ast = body
-- 
GitLab


From ad292c2b42e01c38843eb1a5556ea2ced762f845 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 19 Feb 2025 16:34:14 +0100
Subject: [PATCH 114/180] Add initial version of warp-level reduction for CUDA

---
 src/pystencils/backend/platforms/cuda.py | 73 ++++++++++++++++++------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index eff88df7e..6f32102de 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,9 +1,16 @@
 from __future__ import annotations
+
+import math
+import operator
 from abc import ABC, abstractmethod
+from functools import reduce
 
 from ..ast import PsAstNode
+from ..constants import PsConstant
+from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType
+from ...types.quick import UInt, SInt
 from ..exceptions import MaterializationError
 from .generic_gpu import GenericGpu
 
@@ -17,14 +24,14 @@ from ..kernelcreation import (
 )
 
 from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment
 from ..ast.expressions import (
     PsExpression,
     PsLiteralExpr,
     PsCast,
     PsCall,
     PsLookup,
-    PsBufferAcc, PsSymbolExpr
+    PsBufferAcc, PsSymbolExpr, PsConstantExpr, PsAdd, PsRem, PsEq
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
@@ -292,26 +299,58 @@ class CudaPlatform(GenericGpu):
             case ReductionFunctions.WriteBackToPtr:
                 ptr_expr, symbol_expr = call.args
                 op = call.function.reduction_op
+                stype = symbol_expr.dtype
+                ptrtype = ptr_expr.dtype
+
+                warp_size = 32   # TODO: get from platform/user config
+
+                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
 
-                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
-                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+                if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+                    NotImplementedError("atomic operations are only available for float32/64 datatypes")
 
+                def gen_shuffle_instr(offset: int):
+                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                                  [PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))),
+                                   symbol_expr,
+                                   PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+                # workaround for subtractions -> use additions for reducing intermediate results
+                # similar to OpenMP reductions: local copies (negative sign) are added at the end
                 match op:
                     case ReductionOp.Sub:
-                        # workaround for unsupported atomicSub: use atomic add
-                        # similar to OpenMP reductions: local copies (negative sign) are added at the end
-                        call.function = CFunction("atomicAdd", [ptr_expr.dtype, symbol_expr.dtype],
-                                                  PsCustomType("void"))
-                        call.args = (ptr_expr, symbol_expr)
+                        actual_op = ReductionOp.Add
                     case _:
-                        call.function = CFunction(f"atomic{op.name}", [ptr_expr.dtype, symbol_expr.dtype],
-                                                  PsCustomType("void"))
-                        call.args = (ptr_expr, symbol_expr)
-
-                if not isinstance(symbol_expr.dtype, PsIeeeFloatType) or symbol_expr.dtype.width not in (32, 64):
-                    NotImplementedError("atomicMul is only available for float32/64 datatypes")
-
-                return PsStatement(call)
+                        actual_op = op
+
+                # perform local warp reductions
+                num_shuffles = math.frexp(warp_size)[1] - 1
+                shuffles = [PsAssignment(symbol_expr, compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(i)))
+                            for i in reversed(range(1, num_shuffles))]
+
+                # find first thread in warp
+                ispace = self._ctx.get_iteration_space()  # TODO: receive as argument in unfold_function?
+                is_valid_thread = self._get_condition_for_translation(ispace)
+                thread_indices_per_dim = [
+                    idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
+                    for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+                ]
+                tid: PsExpression = thread_indices_per_dim[0]
+                for t in thread_indices_per_dim[1:]:
+                    tid = PsAdd(tid, t)
+                first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))),
+                                            PsConstantExpr(PsConstant(0, SInt(32))))
+                cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+
+                # use atomic operation on first thread of warp to sync
+                call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
+                call.args = (ptr_expr, symbol_expr)
+
+                # assemble warp reduction
+                return PsBlock(
+                    shuffles
+                    + [PsConditional(cond, PsBlock([PsStatement(call)]))])
 
     #   Internals
 
-- 
GitLab


From c7b564bed65615c77f1065dd4821b5a79d2244b3 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 19 Feb 2025 19:04:57 +0100
Subject: [PATCH 115/180] Fix CUDARuntimeError import

---
 tests/kernelcreation/test_reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 992c328d7..c3775964b 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -2,7 +2,6 @@ import pytest
 import numpy as np
 
 import pystencils as ps
-from cupy_backends.cuda.api.runtime import CUDARuntimeError
 from pystencils.sympyextensions import reduction_assignment_from_str
 
 INIT_W = 5
@@ -51,6 +50,7 @@ def test_reduction_cpu(instruction_set, dtype, op):
 def test_reduction_gpu(dtype, op):
     try:
         import cupy as cp
+        from cupy_backends.cuda.api.runtime import CUDARuntimeError
 
         device_count = range(cp.cuda.runtime.getDeviceCount())
         print(f"Found {device_count} GPUs")
-- 
GitLab


From 2b6589b8fd5eea2e416432c9261884e9f50e4e1c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 19 Feb 2025 19:06:30 +0100
Subject: [PATCH 116/180] Introduce masks for warp reductions and fix errors
 when shuffling warp results

---
 src/pystencils/backend/platforms/cuda.py | 34 ++++++++++++++++--------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 6f32102de..873961cc7 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -5,6 +5,8 @@ import operator
 from abc import ABC, abstractmethod
 from functools import reduce
 
+from pystencils.types import PsBoolType
+
 from ..ast import PsAstNode
 from ..constants import PsConstant
 from ...compound_op_mapping import compound_op_to_expr
@@ -310,11 +312,9 @@ class CudaPlatform(GenericGpu):
                 if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
                     NotImplementedError("atomic operations are only available for float32/64 datatypes")
 
-                def gen_shuffle_instr(offset: int):
-                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                                  [PsLiteralExpr(PsLiteral("0xffffffff", UInt(32))),
-                                   symbol_expr,
-                                   PsConstantExpr(PsConstant(offset, SInt(32)))])
+                # set up mask symbol for active threads in warp
+                mask = PsSymbol("__shfl_mask", UInt(32))
+                self._ctx.add_symbol(mask)
 
                 # workaround for subtractions -> use additions for reducing intermediate results
                 # similar to OpenMP reductions: local copies (negative sign) are added at the end
@@ -325,8 +325,13 @@ class CudaPlatform(GenericGpu):
                         actual_op = op
 
                 # perform local warp reductions
-                num_shuffles = math.frexp(warp_size)[1] - 1
-                shuffles = [PsAssignment(symbol_expr, compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(i)))
+                def gen_shuffle_instr(offset: int):
+                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                                  [PsSymbolExpr(mask), symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+                num_shuffles = math.frexp(warp_size)[1]
+                shuffles = [PsAssignment(symbol_expr,
+                                         compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
                             for i in reversed(range(1, num_shuffles))]
 
                 # find first thread in warp
@@ -343,14 +348,21 @@ class CudaPlatform(GenericGpu):
                                             PsConstantExpr(PsConstant(0, SInt(32))))
                 cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
 
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)),
+                                                 [full_mask, is_valid_thread])
+                decl_mask = PsDeclaration(PsSymbolExpr(mask), ballot_instr if is_valid_thread else full_mask)
+
                 # use atomic operation on first thread of warp to sync
                 call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
                 call.args = (ptr_expr, symbol_expr)
 
                 # assemble warp reduction
-                return PsBlock(
-                    shuffles
-                    + [PsConditional(cond, PsBlock([PsStatement(call)]))])
+                return PsConditional(is_valid_thread if is_valid_thread else PsConstantExpr(PsLiteral("true", PsBoolType)),
+                    PsBlock(
+                    [decl_mask]
+                    + shuffles
+                    + [PsConditional(cond, PsBlock([PsStatement(call)]))]))
 
     #   Internals
 
@@ -359,7 +371,7 @@ class CudaPlatform(GenericGpu):
     def _get_condition_for_translation(
             self, ispace: IterationSpace):
 
-        if not self._omit_range_check:
+        if self._omit_range_check:
             return None
 
         match ispace:
-- 
GitLab


From bcd83842f3d331fa039a96e3eab68c34b3beb6f3 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 14:56:46 +0100
Subject: [PATCH 117/180] Use full mask for CUDA reductions

---
 src/pystencils/backend/platforms/cuda.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 873961cc7..8936bf73f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -313,8 +313,9 @@ class CudaPlatform(GenericGpu):
                     NotImplementedError("atomic operations are only available for float32/64 datatypes")
 
                 # set up mask symbol for active threads in warp
-                mask = PsSymbol("__shfl_mask", UInt(32))
-                self._ctx.add_symbol(mask)
+                #mask = PsSymbol("__shfl_mask", UInt(32))
+                #self._ctx.add_symbol(mask)
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
 
                 # workaround for subtractions -> use additions for reducing intermediate results
                 # similar to OpenMP reductions: local copies (negative sign) are added at the end
@@ -327,7 +328,7 @@ class CudaPlatform(GenericGpu):
                 # perform local warp reductions
                 def gen_shuffle_instr(offset: int):
                     return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                                  [PsSymbolExpr(mask), symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+                                  [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
 
                 num_shuffles = math.frexp(warp_size)[1]
                 shuffles = [PsAssignment(symbol_expr,
@@ -348,21 +349,19 @@ class CudaPlatform(GenericGpu):
                                             PsConstantExpr(PsConstant(0, SInt(32))))
                 cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
 
-                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)),
-                                                 [full_mask, is_valid_thread])
-                decl_mask = PsDeclaration(PsSymbolExpr(mask), ballot_instr if is_valid_thread else full_mask)
+                #ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)),
+                #                                 [full_mask, is_valid_thread])
+                #decl_mask = PsDeclaration(full_mask)
 
                 # use atomic operation on first thread of warp to sync
                 call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
                 call.args = (ptr_expr, symbol_expr)
 
                 # assemble warp reduction
-                return PsConditional(is_valid_thread if is_valid_thread else PsConstantExpr(PsLiteral("true", PsBoolType)),
-                    PsBlock(
-                    [decl_mask]
-                    + shuffles
-                    + [PsConditional(cond, PsBlock([PsStatement(call)]))]))
+                return PsBlock(
+                    #[decl_mask]
+                    shuffles
+                    + [PsConditional(cond, PsBlock([PsStatement(call)]))])
 
     #   Internals
 
-- 
GitLab


From a8479afadc2a95e2ecda861ce7fd186375477347 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 18:11:51 +0100
Subject: [PATCH 118/180] Refactor reduction function mechanism

---
 src/pystencils/backend/functions.py           |   1 -
 .../backend/kernelcreation/typification.py    |   2 +-
 src/pystencils/backend/platforms/cuda.py      | 142 ++++++++----------
 .../backend/platforms/generic_cpu.py          |  49 +++---
 src/pystencils/backend/platforms/platform.py  |  12 +-
 src/pystencils/backend/platforms/sycl.py      |   9 +-
 .../transformations/select_functions.py       |  36 ++++-
 src/pystencils/codegen/driver.py              |  16 +-
 8 files changed, 119 insertions(+), 148 deletions(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index d28ef5f44..4e38de5e9 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -142,7 +142,6 @@ class ReductionFunctions(Enum):
     Each platform has to materialize these functions to a concrete implementation.
     """
 
-    InitLocalCopy = ("InitLocalCopy", 2)
     WriteBackToPtr = ("WriteBackToPtr", 2)
 
     def __init__(self, func_name, num_args):
diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 544746ef6..284e80b9d 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -617,7 +617,7 @@ class Typifier:
 
             case PsCall(function, args):
                 match function:
-                    case PsMathFunction() | PsReductionFunction():
+                    case PsMathFunction():
                         for arg in args:
                             self.visit_expr(arg, tc)
                         tc.infer_dtype(expr)
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 8936bf73f..1f6506c8f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -212,10 +212,66 @@ class CudaPlatform(GenericGpu):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
-
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
         func = call.function.func
+
+        if func in ReductionFunctions:
+            match func:
+                case ReductionFunctions.WriteBackToPtr:
+                    ptr_expr, symbol_expr = call.args
+                    op = call.function.reduction_op
+                    stype = symbol_expr.dtype
+                    ptrtype = ptr_expr.dtype
+
+                    warp_size = 32   # TODO: get from platform/user config
+
+                    assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+                    assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+                    if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+                        NotImplementedError("atomic operations are only available for float32/64 datatypes")
+
+                    # workaround for subtractions -> use additions for reducing intermediate results
+                    # similar to OpenMP reductions: local copies (negative sign) are added at the end
+                    match op:
+                        case ReductionOp.Sub:
+                            actual_op = ReductionOp.Add
+                        case _:
+                            actual_op = op
+
+                    # perform local warp reductions
+                    def gen_shuffle_instr(offset: int):
+                        full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                        return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                                      [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+                    num_shuffles = math.frexp(warp_size)[1]
+                    shuffles = [PsAssignment(symbol_expr,
+                                             compound_op_to_expr(actual_op,
+                                                                 symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
+                                for i in reversed(range(1, num_shuffles))]
+
+                    # find first thread in warp
+                    ispace = self._ctx.get_iteration_space()
+                    is_valid_thread = self._get_condition_for_translation(ispace)
+                    thread_indices_per_dim = [
+                        idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
+                        for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+                    ]
+                    tid: PsExpression = thread_indices_per_dim[0]
+                    for t in thread_indices_per_dim[1:]:
+                        tid = PsAdd(tid, t)
+                    first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))),
+                                                PsConstantExpr(PsConstant(0, SInt(32))))
+                    cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+
+                    # use atomic operation on first thread of warp to sync
+                    call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
+                    call.args = (ptr_expr, symbol_expr)
+
+                    # assemble warp reduction
+                    return (shuffles, PsConditional(cond, PsBlock([PsStatement(call)])))
+
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
@@ -232,7 +288,7 @@ class CudaPlatform(GenericGpu):
 
             return PsLiteralExpr(PsLiteral(define, dtype))
 
-        if isinstance(dtype, PsIeeeFloatType):
+        if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
             match func:
                 case (
                     MathFunctions.Exp
@@ -285,84 +341,6 @@ class CudaPlatform(GenericGpu):
             f"No implementation available for function {func} on data type {dtype}"
         )
 
-    def unfold_function(
-        self, call: PsCall
-    ) -> PsAstNode:
-        assert isinstance(call.function, PsReductionFunction)
-
-        func = call.function.func
-
-        match func:
-            case ReductionFunctions.InitLocalCopy:
-                symbol_expr, init_val = call.args
-                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
-
-                return PsDeclaration(symbol_expr, init_val)
-            case ReductionFunctions.WriteBackToPtr:
-                ptr_expr, symbol_expr = call.args
-                op = call.function.reduction_op
-                stype = symbol_expr.dtype
-                ptrtype = ptr_expr.dtype
-
-                warp_size = 32   # TODO: get from platform/user config
-
-                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
-                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
-
-                if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                    NotImplementedError("atomic operations are only available for float32/64 datatypes")
-
-                # set up mask symbol for active threads in warp
-                #mask = PsSymbol("__shfl_mask", UInt(32))
-                #self._ctx.add_symbol(mask)
-                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-
-                # workaround for subtractions -> use additions for reducing intermediate results
-                # similar to OpenMP reductions: local copies (negative sign) are added at the end
-                match op:
-                    case ReductionOp.Sub:
-                        actual_op = ReductionOp.Add
-                    case _:
-                        actual_op = op
-
-                # perform local warp reductions
-                def gen_shuffle_instr(offset: int):
-                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                                  [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
-
-                num_shuffles = math.frexp(warp_size)[1]
-                shuffles = [PsAssignment(symbol_expr,
-                                         compound_op_to_expr(actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
-                            for i in reversed(range(1, num_shuffles))]
-
-                # find first thread in warp
-                ispace = self._ctx.get_iteration_space()  # TODO: receive as argument in unfold_function?
-                is_valid_thread = self._get_condition_for_translation(ispace)
-                thread_indices_per_dim = [
-                    idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
-                    for i, idx in enumerate(THREAD_IDX[:ispace.rank])
-                ]
-                tid: PsExpression = thread_indices_per_dim[0]
-                for t in thread_indices_per_dim[1:]:
-                    tid = PsAdd(tid, t)
-                first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))),
-                                            PsConstantExpr(PsConstant(0, SInt(32))))
-                cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
-
-                #ballot_instr = PsCall(CFunction("__ballot_sync", [UInt(32), SInt(32)], SInt(32)),
-                #                                 [full_mask, is_valid_thread])
-                #decl_mask = PsDeclaration(full_mask)
-
-                # use atomic operation on first thread of warp to sync
-                call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
-                call.args = (ptr_expr, symbol_expr)
-
-                # assemble warp reduction
-                return PsBlock(
-                    #[decl_mask]
-                    shuffles
-                    + [PsConditional(cond, PsBlock([PsStatement(call)]))])
-
     #   Internals
 
     # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU?
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 1e7468e33..24692b25c 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -4,8 +4,7 @@ from typing import Sequence
 from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
 
 from ..ast import PsAstNode
-from ..functions import CFunction, PsMathFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, \
-    PsReductionFunction
+from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions
 from ..literals import PsLiteral
 from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions import ReductionOp
@@ -60,43 +59,31 @@ class GenericCpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def unfold_function(
-        self, call: PsCall
-    ) -> PsAstNode:
-        assert isinstance(call.function, PsReductionFunction)
-
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
         func = call.function.func
 
-        match func:
-            case ReductionFunctions.InitLocalCopy:
-                symbol_expr, init_val = call.args
-                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(init_val, PsExpression)
-
-                return PsDeclaration(symbol_expr, init_val)
-            case ReductionFunctions.WriteBackToPtr:
-                ptr_expr, symbol_expr = call.args
-                op = call.function.reduction_op
-
-                assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
-                assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+        if func in ReductionFunctions:
+            match func:
+                case ReductionFunctions.WriteBackToPtr:
+                    ptr_expr, symbol_expr = call.args
+                    op = call.function.reduction_op
 
-                ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
+                    assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+                    assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                # inspired by OpenMP: local reduction variable (negative sign) is added at the end
-                actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
+                    ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
-                # TODO: can this be avoided somehow?
-                potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
-                if isinstance(potential_call, PsCall):
-                    potential_call.dtype = symbol_expr.dtype
-                    potential_call = self.select_function(potential_call)
+                    # inspired by OpenMP: local reduction variable (negative sign) is added at the end
+                    actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
 
-                return PsAssignment(ptr_access, potential_call)
+                    # TODO: can this be avoided somehow?
+                    potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
+                    if isinstance(potential_call, PsCall):
+                        potential_call.dtype = symbol_expr.dtype
+                        potential_call = self.select_function(potential_call)
 
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
+                    return potential_call
 
-        func = call.function.func
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index e195d59bc..90fd69084 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -38,19 +38,9 @@ class Platform(ABC):
     @abstractmethod
     def select_function(
         self, call: PsCall
-    ) -> PsExpression:
+    ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsExpression]:
         """Select an implementation for the given function on the given data type.
 
         If no viable implementation exists, raise a `MaterializationError`.
         """
         pass
-
-    @abstractmethod
-    def unfold_function(
-        self, call: PsCall
-    ) -> PsAstNode:
-        """Unfolds an implementation for the given function on the given data type.
-
-        If no viable implementation exists, raise a `MaterializationError`.
-        """
-        pass
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 9b077fd2b..eae2b7598 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -57,7 +57,7 @@ class SyclPlatform(GenericGpu):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression:
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
         assert isinstance(call.function, PsMathFunction)
 
         func = call.function.func
@@ -108,13 +108,6 @@ class SyclPlatform(GenericGpu):
             f"No implementation available for function {func} on data type {dtype}"
         )
 
-    def unfold_function(
-        self, call: PsCall
-    ) -> PsAstNode:
-        raise MaterializationError(
-            f"No implementation available for function {call.function.name}"
-        )
-
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> PsBlock:
diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py
index 0045de87b..288650698 100644
--- a/src/pystencils/backend/transformations/select_functions.py
+++ b/src/pystencils/backend/transformations/select_functions.py
@@ -1,6 +1,8 @@
-from ..platforms import Platform
+from ..ast.structural import PsStatement, PsAssignment, PsBlock
+from ..exceptions import MaterializationError
+from ..platforms import Platform, CudaPlatform
 from ..ast import PsAstNode
-from ..ast.expressions import PsCall
+from ..ast.expressions import PsCall, PsExpression
 from ..functions import PsMathFunction, PsReductionFunction
 
 
@@ -17,9 +19,31 @@ class SelectFunctions:
     def visit(self, node: PsAstNode) -> PsAstNode:
         node.children = [self.visit(c) for c in node.children]
 
-        if isinstance(node, PsCall) and isinstance(node.function, PsMathFunction):
-            return self._platform.select_function(node)
-        elif isinstance(node, PsCall) and isinstance(node.function, PsReductionFunction):
-            return self._platform.unfold_function(node)
+        if isinstance(node, PsAssignment):
+            rhs = node.rhs
+            if isinstance(rhs, PsCall) and isinstance(rhs.function, PsReductionFunction):
+                resolved_func = self._platform.select_function(rhs)
+
+                match resolved_func:
+                    case ((prepend), expr):
+                        match self._platform:
+                            case CudaPlatform():
+                                # special case: produces statement with atomic operation writing value back to ptr
+                                return PsBlock(prepend + [PsStatement(expr)])
+                            case _:
+                                return PsBlock(prepend + [PsAssignment(node.lhs, expr)])
+                    case PsExpression():
+                        return PsAssignment(node.lhs, resolved_func)
+                    case _:
+                        raise MaterializationError(
+                            f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions."
+                        )
+            else:
+                return node
+        elif isinstance(node, PsCall) and isinstance(node.function, PsMathFunction):
+            resolved_func = self._platform.select_function(node)
+            assert isinstance(resolved_func, PsExpression)
+
+            return resolved_func
         else:
             return node
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 96e9b94ed..9f04d074a 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -16,7 +16,7 @@ from .kernel import Kernel, GpuKernel
 from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
 from ..backend.functions import PsReductionFunction, ReductionFunctions
-from ..backend.ast.expressions import PsSymbolExpr, PsCall
+from ..backend.ast.expressions import PsSymbolExpr, PsCall, PsMemAcc, PsConstantExpr
 from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration
 
 from ..field import Field
@@ -24,7 +24,7 @@ from ..types import PsIntegerType, PsScalarType
 
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
-from ..backend.ast.structural import PsBlock, PsLoop
+from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
     KernelCreationContext,
@@ -187,16 +187,16 @@ class DefaultKernelCreationDriver:
             symbol_expr = typify(PsSymbolExpr(symbol))
             ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
             init_val = typify(reduction_info.init_val)
+            ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
-            init_local_copy = PsCall(PsReductionFunction(ReductionFunctions.InitLocalCopy, reduction_info.op),
-                                     [symbol_expr, init_val])
+            decl_local_copy = PsDeclaration(symbol_expr, init_val)
             write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op),
                                     [ptr_symbol_expr, symbol_expr])
 
-            # Init local reduction variable copy
-            kernel_ast.statements = [init_local_copy] + kernel_ast.statements
-            # Write back result to reduction target variable
-            kernel_ast.statements += [write_back_ptr]
+            prepend_ast = [decl_local_copy]                          # declare and init local copy with neutral element
+            append_ast = [PsAssignment(ptr_access, write_back_ptr)]  # write back result to reduction target variable
+
+            kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast
 
         #   Target-Specific optimizations
         if self._target.is_cpu():
-- 
GitLab


From 89a6f36ad50849267b3a95ef5a035d22566c5748 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 18:17:44 +0100
Subject: [PATCH 119/180] Fix lint

---
 src/pystencils/backend/kernelcreation/typification.py | 2 +-
 src/pystencils/backend/platforms/cuda.py              | 5 +----
 src/pystencils/backend/platforms/generic_cpu.py       | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 284e80b9d..3ca0a16e2 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -50,7 +50,7 @@ from ..ast.expressions import (
     PsNot,
 )
 from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal
-from ..functions import PsMathFunction, CFunction, PsReductionFunction
+from ..functions import PsMathFunction, CFunction
 from ..ast.util import determine_memory_object
 from ..exceptions import TypificationError
 
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 1f6506c8f..6df502c1f 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -5,8 +5,6 @@ import operator
 from abc import ABC, abstractmethod
 from functools import reduce
 
-from pystencils.types import PsBoolType
-
 from ..ast import PsAstNode
 from ..constants import PsConstant
 from ...compound_op_mapping import compound_op_to_expr
@@ -38,8 +36,7 @@ from ..ast.expressions import (
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction, PsReductionFunction, ReductionFunctions, \
-    NumericLimitsFunctions
+from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions
 
 
 int32 = PsSignedIntegerType(width=32, const=False)
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 24692b25c..3ffdfa22f 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -21,7 +21,7 @@ from ..kernelcreation.iteration_space import (
 )
 
 from ..constants import PsConstant
-from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsAssignment
+from ..ast.structural import PsDeclaration, PsLoop, PsBlock
 from ..ast.expressions import (
     PsSymbolExpr,
     PsExpression,
-- 
GitLab


From 02be4d5e994e458e0e42d72045ec53355e7820d1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 21 Feb 2025 19:35:48 +0100
Subject: [PATCH 120/180] Fix typecheck

---
 src/pystencils/backend/platforms/cuda.py      | 121 +++++++++---------
 .../backend/platforms/generic_cpu.py          |  40 +++---
 src/pystencils/backend/platforms/platform.py  |   2 +-
 .../transformations/select_functions.py       |  20 +--
 src/pystencils/codegen/driver.py              |   8 +-
 5 files changed, 99 insertions(+), 92 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 6df502c1f..291858810 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -36,8 +36,8 @@ from ..ast.expressions import (
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions
-
+from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \
+    PsMathFunction
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -209,65 +209,66 @@ class CudaPlatform(GenericGpu):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
-        func = call.function.func
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+        call_func = call.function
+        assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
-        if func in ReductionFunctions:
-            match func:
-                case ReductionFunctions.WriteBackToPtr:
-                    ptr_expr, symbol_expr = call.args
-                    op = call.function.reduction_op
-                    stype = symbol_expr.dtype
-                    ptrtype = ptr_expr.dtype
-
-                    warp_size = 32   # TODO: get from platform/user config
-
-                    assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
-                    assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
-
-                    if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                        NotImplementedError("atomic operations are only available for float32/64 datatypes")
-
-                    # workaround for subtractions -> use additions for reducing intermediate results
-                    # similar to OpenMP reductions: local copies (negative sign) are added at the end
-                    match op:
-                        case ReductionOp.Sub:
-                            actual_op = ReductionOp.Add
-                        case _:
-                            actual_op = op
-
-                    # perform local warp reductions
-                    def gen_shuffle_instr(offset: int):
-                        full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                        return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                                      [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
-
-                    num_shuffles = math.frexp(warp_size)[1]
-                    shuffles = [PsAssignment(symbol_expr,
-                                             compound_op_to_expr(actual_op,
-                                                                 symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
-                                for i in reversed(range(1, num_shuffles))]
-
-                    # find first thread in warp
-                    ispace = self._ctx.get_iteration_space()
-                    is_valid_thread = self._get_condition_for_translation(ispace)
-                    thread_indices_per_dim = [
-                        idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
-                        for i, idx in enumerate(THREAD_IDX[:ispace.rank])
-                    ]
-                    tid: PsExpression = thread_indices_per_dim[0]
-                    for t in thread_indices_per_dim[1:]:
-                        tid = PsAdd(tid, t)
-                    first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))),
-                                                PsConstantExpr(PsConstant(0, SInt(32))))
-                    cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
-
-                    # use atomic operation on first thread of warp to sync
-                    call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
-                    call.args = (ptr_expr, symbol_expr)
-
-                    # assemble warp reduction
-                    return (shuffles, PsConditional(cond, PsBlock([PsStatement(call)])))
+        func = call_func.func
+
+        if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr:
+            ptr_expr, symbol_expr = call.args
+            op = call_func.reduction_op
+            stype = symbol_expr.dtype
+            ptrtype = ptr_expr.dtype
+
+            warp_size = 32  # TODO: get from platform/user config
+
+            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+            if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+                NotImplementedError("atomic operations are only available for float32/64 datatypes")
+
+            # workaround for subtractions -> use additions for reducing intermediate results
+            # similar to OpenMP reductions: local copies (negative sign) are added at the end
+            match op:
+                case ReductionOp.Sub:
+                    actual_op = ReductionOp.Add
+                case _:
+                    actual_op = op
+
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                              [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+            num_shuffles = math.frexp(warp_size)[1]
+            shuffles = tuple(PsAssignment(symbol_expr,
+                                          compound_op_to_expr(actual_op,
+                                                              symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
+                             for i in reversed(range(1, num_shuffles)))
+
+            # find first thread in warp
+            ispace = self._ctx.get_iteration_space()
+            is_valid_thread = self._get_condition_for_translation(ispace)
+            thread_indices_per_dim = [
+                idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
+                for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+            ]
+            tid: PsExpression = thread_indices_per_dim[0]
+            for t in thread_indices_per_dim[1:]:
+                tid = PsAdd(tid, t)
+            first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(warp_size, SInt(32)))),
+                                        PsConstantExpr(PsConstant(0, SInt(32))))
+            cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+
+            # use atomic operation on first thread of warp to sync
+            call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
+            call.args = (ptr_expr, symbol_expr)
+
+            # assemble warp reduction
+            return shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))
 
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 3ffdfa22f..2f873ff29 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -4,7 +4,8 @@ from typing import Sequence
 from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
 
 from ..ast import PsAstNode
-from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions
+from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, PsMathFunction, \
+    PsReductionFunction
 from ..literals import PsLiteral
 from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions import ReductionOp
@@ -59,30 +60,31 @@ class GenericCpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
-        func = call.function.func
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+        call_func = call.function
+        assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
-        if func in ReductionFunctions:
-            match func:
-                case ReductionFunctions.WriteBackToPtr:
-                    ptr_expr, symbol_expr = call.args
-                    op = call.function.reduction_op
+        func = call_func.func
+
+        if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr:
+            ptr_expr, symbol_expr = call.args
+            op = call_func.reduction_op
 
-                    assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
-                    assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
+            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
 
-                    ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
+            ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
-                    # inspired by OpenMP: local reduction variable (negative sign) is added at the end
-                    actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
+            # inspired by OpenMP: local reduction variable (negative sign) is added at the end
+            actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
 
-                    # TODO: can this be avoided somehow?
-                    potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
-                    if isinstance(potential_call, PsCall):
-                        potential_call.dtype = symbol_expr.dtype
-                        potential_call = self.select_function(potential_call)
+            # create binop and potentially select corresponding function for e.g. min or max
+            potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
+            if isinstance(potential_call, PsCall):
+                potential_call.dtype = symbol_expr.dtype
+                return self.select_function(potential_call)
 
-                    return potential_call
+            return potential_call
 
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 90fd69084..437962172 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -38,7 +38,7 @@ class Platform(ABC):
     @abstractmethod
     def select_function(
         self, call: PsCall
-    ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsExpression]:
+    ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
         """Select an implementation for the given function on the given data type.
 
         If no viable implementation exists, raise a `MaterializationError`.
diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py
index 288650698..d5f731653 100644
--- a/src/pystencils/backend/transformations/select_functions.py
+++ b/src/pystencils/backend/transformations/select_functions.py
@@ -1,6 +1,6 @@
-from ..ast.structural import PsStatement, PsAssignment, PsBlock
+from ..ast.structural import PsAssignment, PsBlock
 from ..exceptions import MaterializationError
-from ..platforms import Platform, CudaPlatform
+from ..platforms import Platform
 from ..ast import PsAstNode
 from ..ast.expressions import PsCall, PsExpression
 from ..functions import PsMathFunction, PsReductionFunction
@@ -25,13 +25,17 @@ class SelectFunctions:
                 resolved_func = self._platform.select_function(rhs)
 
                 match resolved_func:
-                    case ((prepend), expr):
-                        match self._platform:
-                            case CudaPlatform():
-                                # special case: produces statement with atomic operation writing value back to ptr
-                                return PsBlock(prepend + [PsStatement(expr)])
+                    case (prepend, new_rhs):
+                        assert isinstance(prepend, tuple)
+
+                        match new_rhs:
+                            case PsExpression():
+                                return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),))
+                            case PsAstNode():
+                                # special case: produces structural with atomic operation writing value back to ptr
+                                return PsBlock(prepend + (new_rhs,))
                             case _:
-                                return PsBlock(prepend + [PsAssignment(node.lhs, expr)])
+                                assert False, "Unexpected output from SelectFunctions."
                     case PsExpression():
                         return PsAssignment(node.lhs, resolved_func)
                     case _:
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 9f04d074a..cc3411249 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -187,16 +187,16 @@ class DefaultKernelCreationDriver:
             symbol_expr = typify(PsSymbolExpr(symbol))
             ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
             init_val = typify(reduction_info.init_val)
-            ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
 
-            decl_local_copy = PsDeclaration(symbol_expr, init_val)
+            ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
             write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op),
                                     [ptr_symbol_expr, symbol_expr])
 
-            prepend_ast = [decl_local_copy]                          # declare and init local copy with neutral element
+            prepend_ast = [PsDeclaration(symbol_expr, init_val)]     # declare and init local copy with neutral element
             append_ast = [PsAssignment(ptr_access, write_back_ptr)]  # write back result to reduction target variable
 
-            kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast
+            kernel_ast.statements = prepend_ast + kernel_ast.statements
+            kernel_ast.statements += append_ast
 
         #   Target-Specific optimizations
         if self._target.is_cpu():
-- 
GitLab


From a972d759e7e6e8e8e0c6241a70456184a1366143 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 26 Feb 2025 19:15:45 +0100
Subject: [PATCH 121/180] Fix getter for thread exec condition for dense/sparse
 iteration spaces in cuda.py

---
 src/pystencils/backend/platforms/cuda.py | 68 +++++++++++-------------
 1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 291858810..e67b70db6 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -172,7 +172,7 @@ class Blockwise4DMapping(ThreadMapping):
 
 class CudaPlatform(GenericGpu):
     """Platform for CUDA-based GPUs.
-
+    
     Args:
         ctx: The kernel creation context
         omit_range_check: If `True`, generated index translation code will not check if the point identified
@@ -209,6 +209,33 @@ class CudaPlatform(GenericGpu):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
+    def _get_condition_for_translation(self, ispace: IterationSpace):
+        if self._omit_range_check:
+            return None
+
+        if isinstance(ispace, FullIterationSpace):
+            conds = []
+
+            dimensions = ispace.dimensions_in_loop_order()
+
+            for dim in dimensions:
+                ctr_expr = PsExpression.make(dim.counter)
+                conds.append(PsLt(ctr_expr, dim.stop))
+
+            condition: PsExpression = conds[0]
+            for cond in conds[1:]:
+                condition = PsAnd(condition, cond)
+
+            return condition
+        elif isinstance(ispace, SparseIterationSpace):
+            sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+            stop = PsExpression.make(ispace.index_list.shape[0])
+
+            return PsLt(sparse_ctr_expr.clone(), stop)
+        else:
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
+
+
     def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
@@ -341,47 +368,12 @@ class CudaPlatform(GenericGpu):
 
     #   Internals
 
-    # TODO: SYCL platform has very similar code for fetching conditionals -> move to GenericGPU?
-
-    def _get_condition_for_translation(
-            self, ispace: IterationSpace):
-
-        if self._omit_range_check:
-            return None
-
-        match ispace:
-            case FullIterationSpace():
-
-                dimensions = ispace.dimensions_in_loop_order()
-
-                conds = []
-                for dim in dimensions:
-                    ctr_expr = PsExpression.make(dim.counter)
-                    conds.append(PsLt(ctr_expr, dim.stop))
-
-                    if conds:
-                        condition: PsExpression = conds[0]
-                        for cond in conds[1:]:
-                            condition = PsAnd(condition, cond)
-                        return condition
-                    else:
-                        return None
-
-            case SparseIterationSpace():
-                sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
-                stop = PsExpression.make(ispace.index_list.shape[0])
-
-                return PsLt(sparse_ctr_expr.clone(), stop)
-            case _:
-                assert False, "Unknown iteration space"
-
     def _prepend_dense_translation(
         self, body: PsBlock, ispace: FullIterationSpace
     ) -> PsBlock:
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
-        cond = self._get_condition_for_translation(ispace)
 
         dimensions = ispace.dimensions_in_loop_order()
 
@@ -396,6 +388,7 @@ class CudaPlatform(GenericGpu):
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
 
+        cond = self._get_condition_for_translation(ispace)
         if cond:
             ast = PsBlock(indexing_decls + [PsConditional(cond, body)])
         else:
@@ -410,8 +403,6 @@ class CudaPlatform(GenericGpu):
         factory = AstFactory(self._ctx)
         ispace.sparse_counter.dtype = constify(ispace.sparse_counter.get_dtype())
 
-        cond = self._get_condition_for_translation(ispace)
-
         sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
         ctr_mapping = self._thread_mapping(ispace)
 
@@ -434,6 +425,7 @@ class CudaPlatform(GenericGpu):
         ]
         body.statements = mappings + body.statements
 
+        cond = self._get_condition_for_translation(ispace)
         if cond:
             ast = PsBlock([sparse_idx_decl, PsConditional(cond, body)])
         else:
-- 
GitLab


From 4a031fc192fc3f7d30fde450e8ff1788d3ecc3dd Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 27 Feb 2025 11:49:00 +0100
Subject: [PATCH 122/180] Fix lint

---
 src/pystencils/backend/platforms/cuda.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index e67b70db6..a9ec9d8d6 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -235,7 +235,6 @@ class CudaPlatform(GenericGpu):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-
     def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
-- 
GitLab


From c7f1518efc14326f9b5732e7e8f6ac7f07acc71a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 16:28:42 +0100
Subject: [PATCH 123/180] Move manual atomic op implementations to new header

---
 src/pystencils/backend/platforms/cuda.py      |  5 +-
 src/pystencils/include/gpu_atomics.h          | 90 +++++++++++++++++++
 .../include/pystencils_runtime/hip.h          | 89 ------------------
 3 files changed, 94 insertions(+), 90 deletions(-)
 create mode 100644 src/pystencils/include/gpu_atomics.h

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 7aac0d412..32744661a 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -197,7 +197,10 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}  # TODO: move to HipPlatform once it is introduced
+        return {
+            '"pystencils_runtime/hip.h"',  # TODO: move to HipPlatform once it is introduced
+            '"gpu_atomics.h'
+        }
 
     def materialize_iteration_space(
         self, body: PsBlock, ispace: IterationSpace
diff --git a/src/pystencils/include/gpu_atomics.h b/src/pystencils/include/gpu_atomics.h
new file mode 100644
index 000000000..6de5c3321
--- /dev/null
+++ b/src/pystencils/include/gpu_atomics.h
@@ -0,0 +1,90 @@
+#pragma once
+
+// No direct implementation for all atomic operations available
+// -> add support by custom implementations using a CAS mechanism
+
+#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__)
+
+// - atomicMul (double/float)
+//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
+__device__ double atomicMul(double* address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int oldValue = *address_as_ull, assumed;
+    do {
+      assumed = oldValue;
+      oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val *
+                           __longlong_as_double(assumed)));
+    } while (assumed != oldValue);
+
+    return __longlong_as_double(oldValue);
+}
+
+__device__ float atomicMul(float* address, float val) {
+    int* address_as_int = (int*)address;
+    int old = *address_as_int;
+    int assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
+    } while (assumed != old);
+
+    return __int_as_float(old);
+}
+
+#endif
+
+#ifdef __CUDA_ARCH__
+
+// - atomicMin (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMin(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val < __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMin(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val < __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+// - atomicMax (double/float)
+//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ double atomicMax(double *address, double val)
+{
+    unsigned long long ret = __double_as_longlong(*address);
+    while(val > __longlong_as_double(ret))
+    {
+        unsigned long long old = ret;
+        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
+            break;
+    }
+    return __longlong_as_double(ret);
+}
+
+__device__ __forceinline__ float atomicMax(float *address, float val)
+{
+    int ret = __float_as_int(*address);
+    while(val > __int_as_float(ret))
+    {
+        int old = ret;
+        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
+            break;
+    }
+    return __int_as_float(ret);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/pystencils/include/pystencils_runtime/hip.h b/src/pystencils/include/pystencils_runtime/hip.h
index b0b4d9679..4bf4917f8 100644
--- a/src/pystencils/include/pystencils_runtime/hip.h
+++ b/src/pystencils/include/pystencils_runtime/hip.h
@@ -6,92 +6,3 @@ typedef __hip_int8_t int8_t;
 typedef __hip_uint16_t uint16_t;
 typedef __hip_int16_t int16_t;
 #endif
-
-// No direct implementation for all atomic operations available
-// -> add support by custom implementations using a CAS mechanism
-
-#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__)
-
-// - atomicMul (double/float)
-//   see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division
-__device__ double atomicMul(double* address, double val) {
-    unsigned long long int* address_as_ull = (unsigned long long int*)address;
-    unsigned long long int oldValue = *address_as_ull, assumed;
-    do {
-      assumed = oldValue;
-      oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val *
-                           __longlong_as_double(assumed)));
-    } while (assumed != oldValue);
-
-    return __longlong_as_double(oldValue);
-}
-
-__device__ float atomicMul(float* address, float val) {
-    int* address_as_int = (int*)address;
-    int old = *address_as_int;
-    int assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
-    } while (assumed != old);
-
-    return __int_as_float(old);
-}
-
-#endif
-
-#ifdef __CUDA_ARCH__
-
-// - atomicMin (double/float)
-//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ double atomicMin(double *address, double val)
-{
-    unsigned long long ret = __double_as_longlong(*address);
-    while(val < __longlong_as_double(ret))
-    {
-        unsigned long long old = ret;
-        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
-            break;
-    }
-    return __longlong_as_double(ret);
-}
-
-__device__ __forceinline__ float atomicMin(float *address, float val)
-{
-    int ret = __float_as_int(*address);
-    while(val < __int_as_float(ret))
-    {
-        int old = ret;
-        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
-            break;
-    }
-    return __int_as_float(ret);
-}
-
-// - atomicMax (double/float)
-//   see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
-__device__ __forceinline__ double atomicMax(double *address, double val)
-{
-    unsigned long long ret = __double_as_longlong(*address);
-    while(val > __longlong_as_double(ret))
-    {
-        unsigned long long old = ret;
-        if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old)
-            break;
-    }
-    return __longlong_as_double(ret);
-}
-
-__device__ __forceinline__ float atomicMax(float *address, float val)
-{
-    int ret = __float_as_int(*address);
-    while(val > __int_as_float(ret))
-    {
-        int old = ret;
-        if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old)
-            break;
-    }
-    return __int_as_float(ret);
-}
-
-#endif
-- 
GitLab


From 974cf848bfe474a5003e95c907e3e1289b6a5454 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 12 Mar 2025 16:40:25 +0100
Subject: [PATCH 124/180] Fix missing closing quote in gpu_atomics.h include

---
 src/pystencils/backend/platforms/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 32744661a..637723f07 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -199,7 +199,7 @@ class CudaPlatform(GenericGpu):
     def required_headers(self) -> set[str]:
         return {
             '"pystencils_runtime/hip.h"',  # TODO: move to HipPlatform once it is introduced
-            '"gpu_atomics.h'
+            '"gpu_atomics.h"'
         }
 
     def materialize_iteration_space(
-- 
GitLab


From 90837d04eaad5b3ad049c11fa5af48cf0942e812 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 15:36:01 +0100
Subject: [PATCH 125/180] Merge handling for GPU reductions into generic_gpu.py
 for the time being

---
 .../backend/platforms/generic_gpu.py          | 147 +++++++++++++++---
 src/pystencils/codegen/driver.py              |   5 +
 2 files changed, 130 insertions(+), 22 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 11425d923..f16c28e8c 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,7 +1,16 @@
 from __future__ import annotations
-from abc import ABC, abstractmethod
 
-from ...types import constify, deconstify
+import math
+import operator
+from abc import ABC, abstractmethod
+from functools import reduce
+
+from ..ast import PsAstNode
+from ..constants import PsConstant
+from ...compound_op_mapping import compound_op_to_expr
+from ...sympyextensions.reduction import ReductionOp
+from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType
+from ...types.quick import UInt, SInt
 from ..exceptions import MaterializationError
 from .platform import Platform
 
@@ -15,7 +24,7 @@ from ..kernelcreation import (
 )
 
 from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment
 from ..ast.expressions import (
     PsExpression,
     PsLiteralExpr,
@@ -23,12 +32,17 @@ from ..ast.expressions import (
     PsCall,
     PsLookup,
     PsBufferAcc,
+    PsSymbolExpr,
+    PsConstantExpr,
+    PsAdd,
+    PsRem,
+    PsEq
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import PsMathFunction, MathFunctions, CFunction
-
+from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \
+    PsMathFunction
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -174,10 +188,15 @@ class GenericGpu(Platform):
     def __init__(
         self,
         ctx: KernelCreationContext,
+        assume_warp_aligned_block_size: bool,
+        warp_size: int | None,
         thread_mapping: ThreadMapping | None = None,
     ) -> None:
         super().__init__(ctx)
 
+        self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
+        self._warp_size = warp_size
+
         self._thread_mapping = (
             thread_mapping if thread_mapping is not None else Linear3DMapping()
         )
@@ -194,14 +213,107 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression:
-        assert isinstance(call.function, PsMathFunction)
+    @staticmethod
+    def _get_condition_for_translation(ispace: IterationSpace):
+
+        if isinstance(ispace, FullIterationSpace):
+            conds = []
+
+            dimensions = ispace.dimensions_in_loop_order()
+
+            for dim in dimensions:
+                ctr_expr = PsExpression.make(dim.counter)
+                conds.append(PsLt(ctr_expr, dim.stop))
+
+            condition: PsExpression = conds[0]
+            for cond in conds[1:]:
+                condition = PsAnd(condition, cond)
+
+            return condition
+        elif isinstance(ispace, SparseIterationSpace):
+            sparse_ctr_expr = PsExpression.make(ispace.sparse_counter)
+            stop = PsExpression.make(ispace.index_list.shape[0])
+
+            return PsLt(sparse_ctr_expr.clone(), stop)
+        else:
+            raise MaterializationError(f"Unknown type of iteration space: {ispace}")
+
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+        call_func = call.function
+        assert isinstance(call_func, PsReductionFunction | PsMathFunction)
+
+        func = call_func.func
+
+        if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr:
+            ptr_expr, symbol_expr = call.args
+            op = call_func.reduction_op
+            stype = symbol_expr.dtype
+            ptrtype = ptr_expr.dtype
+
+            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+            if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+                NotImplementedError("atomic operations are only available for float32/64 datatypes")
+
+            # workaround for subtractions -> use additions for reducing intermediate results
+            # similar to OpenMP reductions: local copies (negative sign) are added at the end
+            match op:
+                case ReductionOp.Sub:
+                    actual_op = ReductionOp.Add
+                case _:
+                    actual_op = op
+
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                              [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(PsAssignment(symbol_expr,
+                                          compound_op_to_expr(actual_op,
+                                                              symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
+                             for i in reversed(range(1, num_shuffles)))
+
+            # find first thread in warp
+            ispace = self._ctx.get_iteration_space()
+            is_valid_thread = self._get_condition_for_translation(ispace)
+            thread_indices_per_dim = [
+                idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
+                for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+            ]
+            tid: PsExpression = thread_indices_per_dim[0]
+            for t in thread_indices_per_dim[1:]:
+                tid = PsAdd(tid, t)
+            first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+                                        PsConstantExpr(PsConstant(0, SInt(32))))
+            cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+
+            # use atomic operation on first thread of warp to sync
+            call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
+            call.args = (ptr_expr, symbol_expr)
+
+            # assemble warp reduction
+            return shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))
 
-        func = call.function.func
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsIeeeFloatType):
+        if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions:
+            assert isinstance(dtype, PsIeeeFloatType)
+
+            match func:
+                case NumericLimitsFunctions.Min:
+                    define = "NEG_INFINITY"
+                case NumericLimitsFunctions.Max:
+                    define = "POS_INFINITY"
+                case _:
+                    raise MaterializationError(f"Cannot materialize call to function {func}")
+
+            return PsLiteralExpr(PsLiteral(define, dtype))
+
+        if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
             match func:
                 case (
                     MathFunctions.Exp
@@ -262,7 +374,6 @@ class GenericGpu(Platform):
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
-        conds = []
 
         dimensions = ispace.dimensions_in_loop_order()
 
@@ -276,14 +387,9 @@ class GenericGpu(Platform):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            conds.append(PsLt(ctr_expr, dim.stop))
-
-        condition: PsExpression = conds[0]
-        for cond in conds[1:]:
-            condition = PsAnd(condition, cond)
-        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
 
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock(indexing_decls + [PsConditional(cond, body)])
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -313,8 +419,5 @@ class GenericGpu(Platform):
         ]
         body.statements = mappings + body.statements
 
-        stop = PsExpression.make(ispace.index_list.shape[0])
-        condition = PsLt(sparse_ctr_expr.clone(), stop)
-        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock([sparse_idx_decl, PsConditional(cond, body)])
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index c2bee0ad2..3962c316b 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -475,6 +475,9 @@ class DefaultKernelCreationDriver:
                 else None
             )
 
+            assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
+            warp_size: int | None = self._cfg.gpu.get_option("warp_size")
+
             GpuPlatform: type
             match self._target:
                 case Target.CUDA:
@@ -486,6 +489,8 @@ class DefaultKernelCreationDriver:
 
             return GpuPlatform(
                 self._ctx,
+                assume_warp_aligned_block_size,
+                warp_size,
                 thread_mapping=thread_mapping,
             )
 
-- 
GitLab


From 9ec813bdd1c8dfd2a1f32bb300d6f9e7d172542f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 16:07:01 +0100
Subject: [PATCH 126/180] Employ optimized warp-level reduction based on check

---
 .../backend/platforms/generic_gpu.py          | 62 +++++++++++--------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index f16c28e8c..d3e8de42d 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -264,33 +264,45 @@ class GenericGpu(Platform):
                 case _:
                     actual_op = op
 
-            # perform local warp reductions
-            def gen_shuffle_instr(offset: int):
-                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                              [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
-
-            num_shuffles = math.frexp(self._warp_size)[1]
-            shuffles = tuple(PsAssignment(symbol_expr,
-                                          compound_op_to_expr(actual_op,
-                                                              symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
-                             for i in reversed(range(1, num_shuffles)))
-
-            # find first thread in warp
+            # check if thread is valid for performing reduction
             ispace = self._ctx.get_iteration_space()
             is_valid_thread = self._get_condition_for_translation(ispace)
-            thread_indices_per_dim = [
-                idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
-                for i, idx in enumerate(THREAD_IDX[:ispace.rank])
-            ]
-            tid: PsExpression = thread_indices_per_dim[0]
-            for t in thread_indices_per_dim[1:]:
-                tid = PsAdd(tid, t)
-            first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
-                                        PsConstantExpr(PsConstant(0, SInt(32))))
-            cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
-
-            # use atomic operation on first thread of warp to sync
+
+            cond: PsExpression
+            shuffles: tuple[PsAssignment, ...]
+            if self._warp_size and self._assume_warp_aligned_block_size:
+                # perform local warp reductions
+                def gen_shuffle_instr(offset: int):
+                    full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                                  [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+
+                # set up shuffle instructions for warp-level reduction
+                num_shuffles = math.frexp(self._warp_size)[1]
+                shuffles = tuple(PsAssignment(symbol_expr,
+                                              compound_op_to_expr(actual_op,
+                                                                  symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
+                                 for i in reversed(range(1, num_shuffles)))
+
+                # find first thread in warp
+                thread_indices_per_dim = [
+                    idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
+                    for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+                ]
+                tid: PsExpression = thread_indices_per_dim[0]
+                for t in thread_indices_per_dim[1:]:
+                    tid = PsAdd(tid, t)
+                first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+                                            PsConstantExpr(PsConstant(0, SInt(32))))
+
+                # set condition to only execute atomic operation on first valid thread in warp
+                cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+            else:
+                # no optimization: only execute atomic add on valid thread
+                shuffles = ()
+                cond = is_valid_thread
+
+            # use atomic operation
             call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
             call.args = (ptr_expr, symbol_expr)
 
-- 
GitLab


From d7e6890cff8f327ce285a3360821a4341b2acc46 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 16:23:54 +0100
Subject: [PATCH 127/180] Fix typecheck

---
 src/pystencils/backend/platforms/generic_cpu.py    |  4 ++--
 src/pystencils/backend/platforms/generic_gpu.py    |  4 ++--
 src/pystencils/backend/platforms/platform.py       |  4 ++--
 src/pystencils/backend/platforms/sycl.py           |  4 ++--
 .../backend/transformations/loop_vectorizer.py     | 14 +++++++-------
 .../backend/transformations/select_functions.py    |  4 ++--
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 2f873ff29..43b048184 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -22,7 +22,7 @@ from ..kernelcreation.iteration_space import (
 )
 
 from ..constants import PsConstant
-from ..ast.structural import PsDeclaration, PsLoop, PsBlock
+from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode
 from ..ast.expressions import (
     PsSymbolExpr,
     PsExpression,
@@ -60,7 +60,7 @@ class GenericCpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index d3e8de42d..9b21457be 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -24,7 +24,7 @@ from ..kernelcreation import (
 )
 
 from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment
+from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment, PsStructuralNode
 from ..ast.expressions import (
     PsExpression,
     PsLiteralExpr,
@@ -238,7 +238,7 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 437962172..4f738dd5d 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 
 from ..ast import PsAstNode
-from ..ast.structural import PsBlock
+from ..ast.structural import PsBlock, PsStructuralNode
 from ..ast.expressions import PsCall, PsExpression
 
 from ..kernelcreation.context import KernelCreationContext
@@ -38,7 +38,7 @@ class Platform(ABC):
     @abstractmethod
     def select_function(
         self, call: PsCall
-    ) -> PsExpression | tuple[tuple[PsAstNode, ...], PsAstNode]:
+    ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         """Select an implementation for the given function on the given data type.
 
         If no viable implementation exists, raise a `MaterializationError`.
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 7d7b8d1a7..78af01b2f 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -7,7 +7,7 @@ from ..kernelcreation.iteration_space import (
     FullIterationSpace,
     SparseIterationSpace,
 )
-from ..ast.structural import PsDeclaration, PsBlock, PsConditional
+from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode
 from ..ast.expressions import (
     PsExpression,
     PsSymbolExpr,
@@ -56,7 +56,7 @@ class SyclPlatform(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsAstNode], PsExpression]:
+    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         assert isinstance(call.function, PsMathFunction)
 
         func = call.function.func
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index b78114553..a96c6af4b 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -7,7 +7,7 @@ from ...types import PsVectorType, PsScalarType
 from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
-from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment
+from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment, PsStructuralNode
 from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr
 from ..ast.vector import PsVecBroadcast, PsVecHorizontal
 from ..ast.analysis import collect_undefined_symbols
@@ -135,20 +135,20 @@ class LoopVectorizer:
         vc = VectorizationContext(self._ctx, self._lanes, axis)
 
         #   Prepare reductions
-        simd_init_local_reduction_vars = []
-        simd_writeback_local_reduction_vars = []
+        simd_init_local_reduction_vars: list[PsStructuralNode] = []
+        simd_writeback_local_reduction_vars: list[PsStructuralNode] = []
         for symb, reduction_info in self._ctx.symbols_reduction_info.items():
             # Vectorize symbol for local copy
             vector_symb = vc.vectorize_symbol(symb)
 
             # Declare and init vector
-            simd_init_local_reduction_vars += [self._type_fold(PsDeclaration(
-                PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb))))]
+            simd_init_local_reduction_vars += [PsDeclaration(
+                PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb)))]
 
             # Write back vectorization result
-            simd_writeback_local_reduction_vars += [self._type_fold(PsAssignment(
+            simd_writeback_local_reduction_vars += [PsAssignment(
                 PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb),
-                                                    reduction_info.op)))]
+                                                    reduction_info.op))]
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py
index d5f731653..576cebad1 100644
--- a/src/pystencils/backend/transformations/select_functions.py
+++ b/src/pystencils/backend/transformations/select_functions.py
@@ -1,4 +1,4 @@
-from ..ast.structural import PsAssignment, PsBlock
+from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode
 from ..exceptions import MaterializationError
 from ..platforms import Platform
 from ..ast import PsAstNode
@@ -31,7 +31,7 @@ class SelectFunctions:
                         match new_rhs:
                             case PsExpression():
                                 return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),))
-                            case PsAstNode():
+                            case PsStructuralNode():
                                 # special case: produces structural with atomic operation writing value back to ptr
                                 return PsBlock(prepend + (new_rhs,))
                             case _:
-- 
GitLab


From 5ee715d0df2651e745ff5de0524abfe24d48c968 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 17:04:47 +0100
Subject: [PATCH 128/180] Reformat adapted files [skip ci]

---
 src/pystencils/__init__.py                    |  9 +-
 src/pystencils/backend/ast/vector.py          | 46 +++++----
 src/pystencils/backend/emission/ir_printer.py |  4 +-
 .../backend/kernelcreation/context.py         |  6 +-
 .../backend/kernelcreation/freeze.py          | 18 ++--
 .../backend/kernelcreation/typification.py    |  7 +-
 .../backend/platforms/generic_cpu.py          | 46 +++++++--
 .../backend/platforms/generic_gpu.py          | 98 ++++++++++++++-----
 src/pystencils/backend/platforms/platform.py  |  2 +-
 src/pystencils/backend/platforms/sycl.py      |  4 +-
 src/pystencils/backend/platforms/x86.py       | 10 +-
 .../backend/transformations/add_pragmas.py    |  8 +-
 .../transformations/loop_vectorizer.py        | 36 ++++---
 .../transformations/select_functions.py       |  8 +-
 .../transformations/select_intrinsics.py      |  4 +-
 src/pystencils/codegen/driver.py              | 38 +++++--
 src/pystencils/compound_op_mapping.py         | 15 ++-
 src/pystencils/jit/cpu_extension_module.py    |  5 +-
 src/pystencils/sympyextensions/__init__.py    |  4 +-
 src/pystencils/sympyextensions/reduction.py   | 11 ++-
 tests/kernelcreation/test_reduction.py        | 23 +++--
 21 files changed, 283 insertions(+), 119 deletions(-)

diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index a7bf33aa6..329f61d32 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -1,10 +1,6 @@
 """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
 
-from .codegen import (
-    Target,
-    CreateKernelConfig,
-    AUTO
-)
+from .codegen import Target, CreateKernelConfig, AUTO
 from .defaults import DEFAULTS
 from . import fd
 from . import stencil as stencil
@@ -93,4 +89,5 @@ __all__ = [
 ]
 
 from . import _version
-__version__ = _version.get_versions()['version']
+
+__version__ = _version.get_versions()["version"]
diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 4f5224133..4141b0296 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -18,7 +18,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     """Broadcast a scalar value to N vector lanes."""
 
     __match_args__ = ("lanes", "operand")
-    
+
     def __init__(self, lanes: int, operand: PsExpression):
         super().__init__(operand)
         self._lanes = lanes
@@ -26,21 +26,18 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     @property
     def lanes(self) -> int:
         return self._lanes
-    
+
     @lanes.setter
     def lanes(self, n: int):
         self._lanes = n
 
     def _clone_expr(self) -> PsVecBroadcast:
         return PsVecBroadcast(self._lanes, self._operand.clone())
-    
+
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecBroadcast):
             return False
-        return (
-            super().structurally_equal(other)
-            and self._lanes == other._lanes
-        )
+        return super().structurally_equal(other) and self._lanes == other._lanes
 
 
 class PsVecHorizontal(PsBinOp, PsVectorOp):
@@ -48,8 +45,13 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
 
     __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
 
-    def __init__(self, lanes: int, scalar_operand: PsExpression, vector_operand: PsExpression,
-                 reduction_op: ReductionOp):
+    def __init__(
+        self,
+        lanes: int,
+        scalar_operand: PsExpression,
+        vector_operand: PsExpression,
+        reduction_op: ReductionOp,
+    ):
         super().__init__(scalar_operand, vector_operand)
         self._lanes = lanes
         self._reduction_op = reduction_op
@@ -87,19 +89,23 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
         self._reduction_op = op
 
     def _clone_expr(self) -> PsVecHorizontal:
-        return PsVecHorizontal(self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op)
+        return PsVecHorizontal(
+            self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op
+        )
 
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecHorizontal):
             return False
-        return (super().structurally_equal(other)
-                and self._lanes == other._lanes
-                and self._reduction_op == other._reduction_op)
+        return (
+            super().structurally_equal(other)
+            and self._lanes == other._lanes
+            and self._reduction_op == other._reduction_op
+        )
 
 
 class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     """Pointer-based vectorized memory access.
-    
+
     Args:
         base_ptr: Pointer identifying the accessed memory region
         offset: Offset inside the memory region
@@ -150,7 +156,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     @property
     def stride(self) -> PsExpression | None:
         return self._stride
-    
+
     @stride.setter
     def stride(self, expr: PsExpression | None):
         self._stride = expr
@@ -161,10 +167,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
 
     def get_vector_type(self) -> PsVectorType:
         return cast(PsVectorType, self._dtype)
-    
+
     def get_children(self) -> tuple[PsAstNode, ...]:
-        return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,))
-    
+        return (self._ptr, self._offset) + (
+            () if self._stride is None else (self._stride,)
+        )
+
     def set_child(self, idx: int, c: PsAstNode):
         idx = [0, 1, 2][idx]
         match idx:
@@ -193,7 +201,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
             and self._vector_entries == other._vector_entries
             and self._aligned == other._aligned
         )
-    
+
     def __repr__(self) -> str:
         return (
             f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, "
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index 1508e6d94..22ae2f91a 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel):
 
 class IRAstPrinter(BasePrinter):
     """Print the IR AST as pseudo-code.
-    
+
     This printer produces a complete pseudocode representation of a pystencils AST.
     Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for
     each node defined in `ast <pystencils.backend.ast>`.
@@ -85,7 +85,7 @@ class IRAstPrinter(BasePrinter):
 
                 return pc.parenthesize(
                     f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})",
-                    Ops.Weakest
+                    Ops.Weakest,
                 )
 
             case _:
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 827be45a5..536c73c7f 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -106,7 +106,7 @@ class KernelCreationContext:
     def index_dtype(self) -> PsIntegerType:
         """Data type used by default for index expressions"""
         return self._index_dtype
-    
+
     def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType:
         """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are."""
         match dtype:
@@ -191,7 +191,9 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
-    def add_symbol_reduction_info(self, local_symb: PsSymbol, reduction_info: ReductionInfo):
+    def add_symbol_reduction_info(
+        self, local_symb: PsSymbol, reduction_info: ReductionInfo
+    ):
         """Adds entry for a symbol and its reduction info to its corresponding lookup table.
 
         The symbol ``symbol`` shall not exist in the symbol table already.
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index df6bfbd1f..63e9ea5b1 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -57,7 +57,7 @@ from ..ast.expressions import (
     PsAnd,
     PsOr,
     PsNot,
-    PsMemAcc
+    PsMemAcc,
 )
 from ..ast.vector import PsVecMemAcc
 
@@ -110,7 +110,9 @@ class FreezeExpressions:
 
     def __call__(self, obj: AssignmentCollection | sp.Basic) -> PsAstNode:
         if isinstance(obj, AssignmentCollection):
-            return PsBlock([cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments])
+            return PsBlock(
+                [cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments]
+            )
         elif isinstance(obj, AssignmentBase):
             return cast(PsAssignment, self.visit(obj))
         elif isinstance(obj, _ExprLike):
@@ -179,7 +181,9 @@ class FreezeExpressions:
             "/=": ReductionOp.Div,
         }
 
-        return PsAssignment(lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs))
+        return PsAssignment(
+            lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs)
+        )
 
     def map_ReductionAssignment(self, expr: ReductionAssignment):
         assert isinstance(expr.lhs, TypedSymbol)
@@ -327,22 +331,22 @@ class FreezeExpressions:
             raise FreezeError("Cannot translate an empty tuple.")
 
         items = [self.visit_expr(item) for item in expr]
-        
+
         if any(isinstance(i, PsArrayInitList) for i in items):
             #  base case: have nested arrays
             if not all(isinstance(i, PsArrayInitList) for i in items):
                 raise FreezeError(
                     f"Cannot translate nested arrays of non-uniform shape: {expr}"
                 )
-            
+
             subarrays = cast(list[PsArrayInitList], items)
             shape_tail = subarrays[0].shape
-            
+
             if not all(s.shape == shape_tail for s in subarrays[1:]):
                 raise FreezeError(
                     f"Cannot translate nested arrays of non-uniform shape: {expr}"
                 )
-            
+
             return PsArrayInitList([s.items_grid for s in subarrays])  # type: ignore
         else:
             #  base case: no nested arrays
diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 3ca0a16e2..b457f39a0 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -194,9 +194,10 @@ class TypeContext:
                             f"    Target type: {self._target_type}"
                         )
 
-                case PsNumericOpTrait() if not isinstance(
-                    self._target_type, PsNumericType
-                ) or self._target_type.is_bool():
+                case PsNumericOpTrait() if (
+                    not isinstance(self._target_type, PsNumericType)
+                    or self._target_type.is_bool()
+                ):
                     #   FIXME: PsBoolType derives from PsNumericType, but is not numeric
                     raise TypificationError(
                         f"Numerical operation encountered in non-numerical type context:\n"
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 43b048184..ccef61817 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -4,8 +4,14 @@ from typing import Sequence
 from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
 
 from ..ast import PsAstNode
-from ..functions import CFunction, MathFunctions, NumericLimitsFunctions, ReductionFunctions, PsMathFunction, \
-    PsReductionFunction
+from ..functions import (
+    CFunction,
+    MathFunctions,
+    NumericLimitsFunctions,
+    ReductionFunctions,
+    PsMathFunction,
+    PsReductionFunction,
+)
 from ..literals import PsLiteral
 from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions import ReductionOp
@@ -30,7 +36,8 @@ from ..ast.expressions import (
     PsLookup,
     PsGe,
     PsLe,
-    PsTernary, PsLiteralExpr,
+    PsTernary,
+    PsLiteralExpr,
 )
 from ..ast.vector import PsVecMemAcc
 from ...types import PsVectorType, PsCustomType
@@ -60,20 +67,31 @@ class GenericCpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+    def select_function(
+        self, call: PsCall
+    ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
         func = call_func.func
 
-        if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr:
+        if (
+            isinstance(call_func, PsReductionFunction)
+            and func is ReductionFunctions.WriteBackToPtr
+        ):
             ptr_expr, symbol_expr = call.args
             op = call_func.reduction_op
 
-            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptr_expr.dtype, PsPointerType)
-            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(symbol_expr.dtype, PsScalarType)
+            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(
+                ptr_expr.dtype, PsPointerType
+            )
+            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(
+                symbol_expr.dtype, PsScalarType
+            )
 
-            ptr_access = PsMemAcc(ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
+            ptr_access = PsMemAcc(
+                ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))
+            )
 
             # inspired by OpenMP: local reduction variable (negative sign) is added at the end
             actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
@@ -89,8 +107,16 @@ class GenericCpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in (NumericLimitsFunctions.Min, NumericLimitsFunctions.Max):
-            return PsLiteralExpr(PsLiteral(f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", dtype))
+        if isinstance(dtype, PsScalarType) and func in (
+            NumericLimitsFunctions.Min,
+            NumericLimitsFunctions.Max,
+        ):
+            return PsLiteralExpr(
+                PsLiteral(
+                    f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()",
+                    dtype,
+                )
+            )
 
         if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64):
             cfunc: CFunction
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 9b21457be..2a12d6b7b 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -24,7 +24,14 @@ from ..kernelcreation import (
 )
 
 from ..kernelcreation.context import KernelCreationContext
-from ..ast.structural import PsBlock, PsConditional, PsDeclaration, PsStatement, PsAssignment, PsStructuralNode
+from ..ast.structural import (
+    PsBlock,
+    PsConditional,
+    PsDeclaration,
+    PsStatement,
+    PsAssignment,
+    PsStructuralNode,
+)
 from ..ast.expressions import (
     PsExpression,
     PsLiteralExpr,
@@ -36,13 +43,19 @@ from ..ast.expressions import (
     PsConstantExpr,
     PsAdd,
     PsRem,
-    PsEq
+    PsEq,
 )
 from ..ast.expressions import PsLt, PsAnd
 from ...types import PsSignedIntegerType, PsIeeeFloatType
 from ..literals import PsLiteral
-from ..functions import MathFunctions, CFunction, ReductionFunctions, NumericLimitsFunctions, PsReductionFunction, \
-    PsMathFunction
+from ..functions import (
+    MathFunctions,
+    CFunction,
+    ReductionFunctions,
+    NumericLimitsFunctions,
+    PsReductionFunction,
+    PsMathFunction,
+)
 
 int32 = PsSignedIntegerType(width=32, const=False)
 
@@ -131,7 +144,7 @@ class Blockwise4DMapping(ThreadMapping):
         THREAD_IDX[0],
         BLOCK_IDX[0],
         BLOCK_IDX[1],
-        BLOCK_IDX[2]
+        BLOCK_IDX[2],
     ]
 
     def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]:
@@ -177,7 +190,7 @@ class Blockwise4DMapping(ThreadMapping):
 
 class GenericGpu(Platform):
     """Common base platform for CUDA- and HIP-type GPU targets.
-    
+
     Args:
         ctx: The kernel creation context
         omit_range_check: If `True`, generated index translation code will not check if the point identified
@@ -238,23 +251,34 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+    def select_function(
+        self, call: PsCall
+    ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         call_func = call.function
         assert isinstance(call_func, PsReductionFunction | PsMathFunction)
 
         func = call_func.func
 
-        if isinstance(call_func, PsReductionFunction) and func is ReductionFunctions.WriteBackToPtr:
+        if (
+            isinstance(call_func, PsReductionFunction)
+            and func is ReductionFunctions.WriteBackToPtr
+        ):
             ptr_expr, symbol_expr = call.args
             op = call_func.reduction_op
             stype = symbol_expr.dtype
             ptrtype = ptr_expr.dtype
 
-            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
-            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(
+                ptrtype, PsPointerType
+            )
+            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(
+                stype, PsScalarType
+            )
 
             if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                NotImplementedError("atomic operations are only available for float32/64 datatypes")
+                raise NotImplementedError(
+                    "atomic operations are only available for float32/64 datatypes"
+                )
 
             # workaround for subtractions -> use additions for reducing intermediate results
             # similar to OpenMP reductions: local copies (negative sign) are added at the end
@@ -274,36 +298,60 @@ class GenericGpu(Platform):
                 # perform local warp reductions
                 def gen_shuffle_instr(offset: int):
                     full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                    return PsCall(CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
-                                  [full_mask, symbol_expr, PsConstantExpr(PsConstant(offset, SInt(32)))])
+                    return PsCall(
+                        CFunction(
+                            "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype
+                        ),
+                        [
+                            full_mask,
+                            symbol_expr,
+                            PsConstantExpr(PsConstant(offset, SInt(32))),
+                        ],
+                    )
 
                 # set up shuffle instructions for warp-level reduction
                 num_shuffles = math.frexp(self._warp_size)[1]
-                shuffles = tuple(PsAssignment(symbol_expr,
-                                              compound_op_to_expr(actual_op,
-                                                                  symbol_expr, gen_shuffle_instr(pow(2, i - 1))))
-                                 for i in reversed(range(1, num_shuffles)))
+                shuffles = tuple(
+                    PsAssignment(
+                        symbol_expr,
+                        compound_op_to_expr(
+                            actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))
+                        ),
+                    )
+                    for i in reversed(range(1, num_shuffles))
+                )
 
                 # find first thread in warp
                 thread_indices_per_dim = [
-                    idx * PsConstantExpr(PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)))
-                    for i, idx in enumerate(THREAD_IDX[:ispace.rank])
+                    idx
+                    * PsConstantExpr(
+                        PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
+                    )
+                    for i, idx in enumerate(THREAD_IDX[: ispace.rank])
                 ]
                 tid: PsExpression = thread_indices_per_dim[0]
                 for t in thread_indices_per_dim[1:]:
                     tid = PsAdd(tid, t)
-                first_thread_in_warp = PsEq(PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
-                                            PsConstantExpr(PsConstant(0, SInt(32))))
+                first_thread_in_warp = PsEq(
+                    PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+                    PsConstantExpr(PsConstant(0, SInt(32))),
+                )
 
                 # set condition to only execute atomic operation on first valid thread in warp
-                cond = PsAnd(is_valid_thread, first_thread_in_warp) if is_valid_thread else first_thread_in_warp
+                cond = (
+                    PsAnd(is_valid_thread, first_thread_in_warp)
+                    if is_valid_thread
+                    else first_thread_in_warp
+                )
             else:
                 # no optimization: only execute atomic add on valid thread
                 shuffles = ()
                 cond = is_valid_thread
 
             # use atomic operation
-            call.function = CFunction(f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void"))
+            call.function = CFunction(
+                f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")
+            )
             call.args = (ptr_expr, symbol_expr)
 
             # assemble warp reduction
@@ -321,7 +369,9 @@ class GenericGpu(Platform):
                 case NumericLimitsFunctions.Max:
                     define = "POS_INFINITY"
                 case _:
-                    raise MaterializationError(f"Cannot materialize call to function {func}")
+                    raise MaterializationError(
+                        f"Cannot materialize call to function {func}"
+                    )
 
             return PsLiteralExpr(PsLiteral(define, dtype))
 
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 4f738dd5d..7b81865ae 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -12,7 +12,7 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
 
     The platform performs all target-dependent tasks during code generation:
-    
+
     - Translation of the iteration space to an index source (loop nest, GPU indexing, ...)
     - Platform-specific optimizations (e.g. vectorization, OpenMP)
     """
diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py
index 78af01b2f..22d60f9b0 100644
--- a/src/pystencils/backend/platforms/sycl.py
+++ b/src/pystencils/backend/platforms/sycl.py
@@ -56,7 +56,9 @@ class SyclPlatform(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
-    def select_function(self, call: PsCall) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+    def select_function(
+        self, call: PsCall
+    ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         assert isinstance(call.function, PsMathFunction)
 
         func = call.function.func
diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py
index df0945006..add38cfe4 100644
--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -359,7 +359,11 @@ def _x86_op_intrin(
             atype = vtype.scalar_type
         case PsVecHorizontal():
             # horizontal add instead of sub avoids double inversion of sign
-            actual_op = ReductionOp.Add if op.reduction_op == ReductionOp.Sub else op.reduction_op
+            actual_op = (
+                ReductionOp.Add
+                if op.reduction_op == ReductionOp.Sub
+                else op.reduction_op
+            )
             opstr = f"horizontal_{actual_op.name.lower()}"
             rtype = vtype.scalar_type
             atypes = (vtype.scalar_type, vtype)
@@ -409,7 +413,9 @@ def _x86_op_intrin(
                 case (SInt(64), Fp()) | (
                     Fp(),
                     SInt(64),
-                ) if varch < X86VectorArch.AVX512:
+                ) if (
+                    varch < X86VectorArch.AVX512
+                ):
                     panic()
                 # AVX512 only: cvtepiA_epiT if A > T
                 case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512:
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index c9e8b3994..fa466e495 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -126,9 +126,13 @@ class AddOpenMP:
         if bool(ctx.symbols_reduction_info):
             for symbol, reduction_info in ctx.symbols_reduction_info.items():
                 if isinstance(symbol.dtype, PsScalarType):
-                    pragma_text += f" reduction({reduction_info.op.value}: {symbol.name})"
+                    pragma_text += (
+                        f" reduction({reduction_info.op.value}: {symbol.name})"
+                    )
                 else:
-                    NotImplementedError("OMP: Reductions for non-scalar data types are not supported yet.")
+                    raise NotImplementedError(
+                        "OMP: Reductions for non-scalar data types are not supported yet."
+                    )
 
         if collapse is not None:
             if collapse <= 0:
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index a96c6af4b..04d7d20f0 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -7,7 +7,13 @@ from ...types import PsVectorType, PsScalarType
 from ..kernelcreation import KernelCreationContext
 from ..constants import PsConstant
 from ..ast import PsAstNode
-from ..ast.structural import PsLoop, PsBlock, PsDeclaration, PsAssignment, PsStructuralNode
+from ..ast.structural import (
+    PsLoop,
+    PsBlock,
+    PsDeclaration,
+    PsAssignment,
+    PsStructuralNode,
+)
 from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr
 from ..ast.vector import PsVecBroadcast, PsVecHorizontal
 from ..ast.analysis import collect_undefined_symbols
@@ -142,13 +148,25 @@ class LoopVectorizer:
             vector_symb = vc.vectorize_symbol(symb)
 
             # Declare and init vector
-            simd_init_local_reduction_vars += [PsDeclaration(
-                PsSymbolExpr(vector_symb), PsVecBroadcast(self._lanes, PsSymbolExpr(symb)))]
+            simd_init_local_reduction_vars += [
+                PsDeclaration(
+                    PsSymbolExpr(vector_symb),
+                    PsVecBroadcast(self._lanes, PsSymbolExpr(symb)),
+                )
+            ]
 
             # Write back vectorization result
-            simd_writeback_local_reduction_vars += [PsAssignment(
-                PsSymbolExpr(symb), PsVecHorizontal(self._lanes, PsSymbolExpr(symb), PsSymbolExpr(vector_symb),
-                                                    reduction_info.op))]
+            simd_writeback_local_reduction_vars += [
+                PsAssignment(
+                    PsSymbolExpr(symb),
+                    PsVecHorizontal(
+                        self._lanes,
+                        PsSymbolExpr(symb),
+                        PsSymbolExpr(vector_symb),
+                        reduction_info.op,
+                    ),
+                )
+            ]
 
         #   Generate vectorized loop body
         simd_body = self._vectorize_ast(loop.body, vc)
@@ -241,11 +259,7 @@ class LoopVectorizer:
 
                 return PsBlock(
                     simd_init_local_reduction_vars
-                    + [
-                        simd_stop_decl,
-                        simd_step_decl,
-                        simd_loop
-                    ]
+                    + [simd_stop_decl, simd_step_decl, simd_loop]
                     + simd_writeback_local_reduction_vars
                     + [
                         trailing_start_decl,
diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py
index 576cebad1..9ce404693 100644
--- a/src/pystencils/backend/transformations/select_functions.py
+++ b/src/pystencils/backend/transformations/select_functions.py
@@ -21,7 +21,9 @@ class SelectFunctions:
 
         if isinstance(node, PsAssignment):
             rhs = node.rhs
-            if isinstance(rhs, PsCall) and isinstance(rhs.function, PsReductionFunction):
+            if isinstance(rhs, PsCall) and isinstance(
+                rhs.function, PsReductionFunction
+            ):
                 resolved_func = self._platform.select_function(rhs)
 
                 match resolved_func:
@@ -30,7 +32,9 @@ class SelectFunctions:
 
                         match new_rhs:
                             case PsExpression():
-                                return PsBlock(prepend + (PsAssignment(node.lhs, new_rhs),))
+                                return PsBlock(
+                                    prepend + (PsAssignment(node.lhs, new_rhs),)
+                                )
                             case PsStructuralNode():
                                 # special case: produces structural with atomic operation writing value back to ptr
                                 return PsBlock(prepend + (new_rhs,))
diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py
index 49fb9bb08..b20614393 100644
--- a/src/pystencils/backend/transformations/select_intrinsics.py
+++ b/src/pystencils/backend/transformations/select_intrinsics.py
@@ -101,7 +101,9 @@ class SelectIntrinsics:
             if isinstance(expr, PsVecHorizontal):
                 scalar_op = expr.scalar_operand
                 vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc)
-                return self._platform.op_intrinsic(expr, [scalar_op, vector_op_to_scalar])
+                return self._platform.op_intrinsic(
+                    expr, [scalar_op, vector_op_to_scalar]
+                )
             else:
                 return expr
 
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 3962c316b..c285dd7bf 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -26,7 +26,13 @@ from ..types import PsIntegerType, PsScalarType
 from ..backend.memory import PsSymbol
 from ..backend.ast import PsAstNode
 from ..backend.functions import PsReductionFunction, ReductionFunctions
-from ..backend.ast.expressions import PsExpression, PsSymbolExpr, PsCall, PsMemAcc, PsConstantExpr
+from ..backend.ast.expressions import (
+    PsExpression,
+    PsSymbolExpr,
+    PsCall,
+    PsMemAcc,
+    PsConstantExpr,
+)
 from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment
 from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers
 from ..backend.kernelcreation import (
@@ -191,12 +197,20 @@ class DefaultKernelCreationDriver:
             ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
             init_val = typify(reduction_info.init_val)
 
-            ptr_access = PsMemAcc(ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)))
-            write_back_ptr = PsCall(PsReductionFunction(ReductionFunctions.WriteBackToPtr, reduction_info.op),
-                                    [ptr_symbol_expr, symbol_expr])
+            ptr_access = PsMemAcc(
+                ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))
+            )
+            write_back_ptr = PsCall(
+                PsReductionFunction(
+                    ReductionFunctions.WriteBackToPtr, reduction_info.op
+                ),
+                [ptr_symbol_expr, symbol_expr],
+            )
 
-            prepend_ast = [PsDeclaration(symbol_expr, init_val)]     # declare and init local copy with neutral element
-            append_ast = [PsAssignment(ptr_access, write_back_ptr)]  # write back result to reduction target variable
+            # declare and init local copy with neutral element
+            prepend_ast = [PsDeclaration(symbol_expr, init_val)]
+            # write back result to reduction target variable
+            append_ast = [PsAssignment(ptr_access, write_back_ptr)]
 
             kernel_ast.statements = prepend_ast + kernel_ast.statements
             kernel_ast.statements += append_ast
@@ -423,14 +437,18 @@ class DefaultKernelCreationDriver:
 
         idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme")
         manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid")
-        assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
+        assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option(
+            "assume_warp_aligned_block_size"
+        )
         warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
         if warp_size is None:
             warp_size = GpuOptions.default_warp_size(self._target)
 
         if warp_size is None and assume_warp_aligned_block_size:
-            warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.")
+            warn(
+                "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`."
+            )
 
         return GpuIndexing(
             self._ctx,
@@ -475,7 +493,9 @@ class DefaultKernelCreationDriver:
                 else None
             )
 
-            assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size")
+            assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option(
+                "assume_warp_aligned_block_size"
+            )
             warp_size: int | None = self._cfg.gpu.get_option("warp_size")
 
             GpuPlatform: type
diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/compound_op_mapping.py
index f256369f9..193b308d0 100644
--- a/src/pystencils/compound_op_mapping.py
+++ b/src/pystencils/compound_op_mapping.py
@@ -3,7 +3,12 @@ from .backend.exceptions import FreezeError
 from .backend.functions import PsMathFunction, MathFunctions
 from .sympyextensions.reduction import ReductionOp
 
-_available_operator_interface: set[ReductionOp] = {ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Div}
+_available_operator_interface: set[ReductionOp] = {
+    ReductionOp.Add,
+    ReductionOp.Sub,
+    ReductionOp.Mul,
+    ReductionOp.Div,
+}
 
 
 def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
@@ -18,7 +23,9 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
             case ReductionOp.Div:
                 operator = PsDiv
             case _:
-                raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
+                raise FreezeError(
+                    f"Found unsupported operation type for compound assignments: {op}."
+                )
         return operator(op1, op2)
     else:
         match op:
@@ -27,4 +34,6 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
             case ReductionOp.Max:
                 return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
             case _:
-                raise FreezeError(f"Found unsupported operation type for compound assignments: {op}.")
+                raise FreezeError(
+                    f"Found unsupported operation type for compound assignments: {op}."
+                )
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index 03260f649..4d76ea9ca 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -92,6 +92,7 @@ class PsKernelExtensioNModule:
 
         #   Kernels and call wrappers
         from ..backend.emission import CAstPrinter
+
         printer = CAstPrinter(func_prefix="FUNC_PREFIX")
 
         for name, kernel in self._kernels.items():
@@ -293,7 +294,9 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
             self._buffer_types[ptr] = ptr_dtype.base_type
             self.extract_buffer(ptr, param.name)
             buffer = self.get_buffer(param.name)
-            code = f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
+            code = (
+                f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;"
+            )
 
             assert code is not None
 
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index 71f9a049a..bd0fa1fe9 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -28,7 +28,7 @@ from .math import (
     count_operations_in_ast,
     common_denominator,
     get_symmetric_part,
-    SymbolCreator
+    SymbolCreator,
 )
 
 
@@ -67,5 +67,5 @@ __all__ = [
     "common_denominator",
     "get_symmetric_part",
     "SymbolCreator",
-    "DynamicType"
+    "DynamicType",
 ]
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index cebfcb2f7..e95e37c24 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -22,6 +22,7 @@ class ReductionAssignment(AssignmentBase):
     reduction_op : ReductionOp
        Enum for binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
     """
+
     _reduction_op = None  # type: ReductionOp
 
     @property
@@ -55,9 +56,13 @@ class MaxReductionAssignment(ReductionAssignment):
 
 # Mapping from ReductionOp enum to ReductionAssigment classes
 _reduction_assignment_classes = {
-    cls.reduction_op: cls for cls in [
-        AddReductionAssignment, SubReductionAssignment, MulReductionAssignment,
-        MinReductionAssignment, MaxReductionAssignment
+    cls.reduction_op: cls
+    for cls in [
+        AddReductionAssignment,
+        SubReductionAssignment,
+        MulReductionAssignment,
+        MinReductionAssignment,
+        MaxReductionAssignment,
     ]
 }
 
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index c3775964b..6e2b2f3fe 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -10,7 +10,7 @@ SIZE = 15
 SOLUTION = {
     "+": INIT_W + INIT_ARR * SIZE,
     "-": INIT_W - INIT_ARR * SIZE,
-    "*": INIT_W * INIT_ARR ** SIZE,
+    "*": INIT_W * INIT_ARR**SIZE,
     "min": min(INIT_W, INIT_ARR),
     "max": max(INIT_W, INIT_ARR),
 }
@@ -18,7 +18,7 @@ SOLUTION = {
 
 # get AST for kernel with reduction assignment
 def get_reduction_assign_ast(dtype, op, config):
-    x = ps.fields(f'x: {dtype}[1d]')
+    x = ps.fields(f"x: {dtype}[1d]")
     w = ps.TypedSymbol("w", dtype)
 
     red_assign = reduction_assignment_from_str(w, op, x.center())
@@ -26,13 +26,18 @@ def get_reduction_assign_ast(dtype, op, config):
     return ps.create_kernel([red_assign], config, default_dtype=dtype)
 
 
-@pytest.mark.parametrize('instruction_set', ['sse', 'avx'])
-@pytest.mark.parametrize('dtype', ["float64", "float32"])
+@pytest.mark.parametrize("instruction_set", ["sse", "avx"])
+@pytest.mark.parametrize("dtype", ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction_cpu(instruction_set, dtype, op):
-    vectorize_info = {'instruction_set': instruction_set, 'assume_inner_stride_one': True}
+    vectorize_info = {
+        "instruction_set": instruction_set,
+        "assume_inner_stride_one": True,
+    }
 
-    config = ps.CreateKernelConfig(target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info)
+    config = ps.CreateKernelConfig(
+        target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info
+    )
 
     ast_reduction = get_reduction_assign_ast(dtype, op, config)
     ps.show_code(ast_reduction)
@@ -45,7 +50,7 @@ def test_reduction_cpu(instruction_set, dtype, op):
     assert np.allclose(reduction_array, SOLUTION[op])
 
 
-@pytest.mark.parametrize('dtype', ["float64", "float32"])
+@pytest.mark.parametrize("dtype", ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
 def test_reduction_gpu(dtype, op):
     try:
@@ -57,7 +62,9 @@ def test_reduction_gpu(dtype, op):
     except ImportError:
         pytest.skip(reason="CuPy is not available", allow_module_level=True)
     except CUDARuntimeError:
-        pytest.skip(reason="No CUDA capable device is detected", allow_module_level=True)
+        pytest.skip(
+            reason="No CUDA capable device is detected", allow_module_level=True
+        )
 
     config = ps.CreateKernelConfig(target=ps.Target.GPU)
 
-- 
GitLab


From 16a6e80d32e7ff12e151fd75ec6f2905903620df Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 17:11:13 +0100
Subject: [PATCH 129/180] Add missing type fold for loop vectorizer again

---
 src/pystencils/backend/transformations/loop_vectorizer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index 04d7d20f0..48b9ad0da 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -149,9 +149,11 @@ class LoopVectorizer:
 
             # Declare and init vector
             simd_init_local_reduction_vars += [
-                PsDeclaration(
-                    PsSymbolExpr(vector_symb),
-                    PsVecBroadcast(self._lanes, PsSymbolExpr(symb)),
+                self._type_fold(
+                    PsDeclaration(
+                        PsSymbolExpr(vector_symb),
+                        PsVecBroadcast(self._lanes, PsSymbolExpr(symb)),
+                    )
                 )
             ]
 
-- 
GitLab


From 2c507d796f0c5abbff32386f4268d3bdb988c6fa Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 18:18:10 +0100
Subject: [PATCH 130/180] Fix required headers for cuda/hip platforms

---
 src/pystencils/backend/platforms/cuda.py | 4 +++-
 src/pystencils/backend/platforms/hip.py  | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 98ff3e3d3..c05c45f04 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -8,4 +8,6 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return set()
+        return {
+            '"gpu_atomics.h"',
+        }
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index c758995a0..65d844bbb 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -8,4 +8,7 @@ class HipPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}
+        return {
+            '"gpu_atomics.h"',
+            '"pystencils_runtime/hip.h"',
+        }
-- 
GitLab


From 6a6e57f08bec708036387c78919f7d4e028d86d4 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 18:19:03 +0100
Subject: [PATCH 131/180] Fix wrong rank being used for obtaining default block
 sizes

---
 src/pystencils/codegen/gpu_indexing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py
index 43b612bd7..f5606da02 100644
--- a/src/pystencils/codegen/gpu_indexing.py
+++ b/src/pystencils/codegen/gpu_indexing.py
@@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
     def __init__(
         self,
+        rank: int,
         num_work_items: _Dim3Lambda,
         hw_props: HardwareProperties,
         assume_warp_aligned_block_size: bool,
@@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration):
 
         self._assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-        default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items))
+        default_bs = GpuLaunchConfiguration.get_default_block_size(rank)
         self._default_block_size = default_bs
         self._init_block_size: dim3 = default_bs
         self._compute_block_size: (
@@ -598,6 +599,7 @@ class GpuIndexing:
 
         def factory():
             return DynamicBlockSizeLaunchConfiguration(
+                rank,
                 num_work_items,
                 self._hw_props,
                 self._assume_warp_aligned_block_size,
-- 
GitLab


From c31d407433bb075a42691866e87e59208bf99d90 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 20 Mar 2025 18:56:41 +0100
Subject: [PATCH 132/180] Try fixing required headers for cuda and hip for
 reductions

---
 src/pystencils/backend/platforms/cuda.py        |  4 +---
 src/pystencils/backend/platforms/generic_gpu.py | 12 ++++++++++--
 src/pystencils/backend/platforms/hip.py         |  3 +--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index c05c45f04..bbb608f5c 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -8,6 +8,4 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {
-            '"gpu_atomics.h"',
-        }
+        return super().required_headers
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 2a12d6b7b..4f97264b0 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -198,6 +198,14 @@ class GenericGpu(Platform):
         thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points
     """
 
+    @property
+    @abstractmethod
+    def required_headers(self) -> set[str]:
+        return {
+            '"gpu_atomics.h"',
+            "<cmath>",
+        }
+
     def __init__(
         self,
         ctx: KernelCreationContext,
@@ -365,9 +373,9 @@ class GenericGpu(Platform):
 
             match func:
                 case NumericLimitsFunctions.Min:
-                    define = "NEG_INFINITY"
+                    define = "-INFINITY"
                 case NumericLimitsFunctions.Max:
-                    define = "POS_INFINITY"
+                    define = "INFINITY"
                 case _:
                     raise MaterializationError(
                         f"Cannot materialize call to function {func}"
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index 65d844bbb..c5e8b3882 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -8,7 +8,6 @@ class HipPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return {
-            '"gpu_atomics.h"',
+        return super().required_headers | {
             '"pystencils_runtime/hip.h"',
         }
-- 
GitLab


From f77909540a9d26ce9dc1decde5c9508bc1f2d14a Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Mar 2025 16:22:40 +0100
Subject: [PATCH 133/180] Use NPP library for numeric limits for CUDA, use std
 limits for HIP

---
 src/pystencils/backend/platforms/cuda.py      | 25 +++++++++++++++++--
 .../backend/platforms/generic_gpu.py          | 21 +++++-----------
 src/pystencils/backend/platforms/hip.py       | 17 ++++++++++++-
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index bbb608f5c..b5b3478e4 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,32 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..functions import PsFunction, NumericLimitsFunctions
+from ..literals import PsLiteral
+from ...types import PsType, PsIeeeFloatType
 
 
 class CudaPlatform(GenericGpu):
-    """Platform for the CUDA GPU taret."""
+    """Platform for the CUDA GPU target."""
 
     @property
     def required_headers(self) -> set[str]:
-        return super().required_headers
+        return super().required_headers | {
+            '"npp.h"',
+        }
+
+    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        match func:
+            case NumericLimitsFunctions.Min:
+                define = f"NPP_MINABS_{dtype.width}F"
+            case NumericLimitsFunctions.Max:
+                define = f"NPP_MAXABS_{dtype.width}F"
+            case _:
+                raise MaterializationError(
+                    f"Cannot materialize call to function {func}"
+                )
+
+        return PsLiteralExpr(PsLiteral(define, dtype))
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 4f97264b0..787b390fe 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -9,7 +9,7 @@ from ..ast import PsAstNode
 from ..constants import PsConstant
 from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
-from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType
+from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType
 from ...types.quick import UInt, SInt
 from ..exceptions import MaterializationError
 from .platform import Platform
@@ -203,9 +203,12 @@ class GenericGpu(Platform):
     def required_headers(self) -> set[str]:
         return {
             '"gpu_atomics.h"',
-            "<cmath>",
         }
 
+    @abstractmethod
+    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+        pass
+
     def __init__(
         self,
         ctx: KernelCreationContext,
@@ -369,19 +372,7 @@ class GenericGpu(Platform):
         arg_types = (dtype,) * func.num_args
 
         if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions:
-            assert isinstance(dtype, PsIeeeFloatType)
-
-            match func:
-                case NumericLimitsFunctions.Min:
-                    define = "-INFINITY"
-                case NumericLimitsFunctions.Max:
-                    define = "INFINITY"
-                case _:
-                    raise MaterializationError(
-                        f"Cannot materialize call to function {func}"
-                    )
-
-            return PsLiteralExpr(PsLiteral(define, dtype))
+            return self.resolve_numeric_limits(func, dtype)
 
         if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
             match func:
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index c5e8b3882..60e249aeb 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,13 +1,28 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..functions import PsMathFunction
+from ..literals import PsLiteral
+from ...types import PsType, PsIeeeFloatType
 
 
 class HipPlatform(GenericGpu):
-    """Platform for the HIP GPU taret."""
+    """Platform for the HIP GPU target."""
 
     @property
     def required_headers(self) -> set[str]:
         return super().required_headers | {
             '"pystencils_runtime/hip.h"',
+            "<limits>"
         }
+
+    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        return PsLiteralExpr(
+            PsLiteral(
+                f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()",
+                dtype,
+            )
+        )
-- 
GitLab


From 4e5c89b9cd23610f27d61612b14a12e968724e3f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 24 Mar 2025 16:54:50 +0100
Subject: [PATCH 134/180] Fix lint, typecheck

---
 src/pystencils/backend/platforms/cuda.py        | 5 +++--
 src/pystencils/backend/platforms/generic_gpu.py | 4 ++--
 src/pystencils/backend/platforms/hip.py         | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index b5b3478e4..7a5074677 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -2,7 +2,8 @@ from __future__ import annotations
 
 from .generic_gpu import GenericGpu
 from ..ast.expressions import PsExpression, PsLiteralExpr
-from ..functions import PsFunction, NumericLimitsFunctions
+from ..exceptions import MaterializationError
+from ..functions import NumericLimitsFunctions
 from ..literals import PsLiteral
 from ...types import PsType, PsIeeeFloatType
 
@@ -16,7 +17,7 @@ class CudaPlatform(GenericGpu):
             '"npp.h"',
         }
 
-    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         match func:
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 787b390fe..8b7eead8d 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -206,7 +206,7 @@ class GenericGpu(Platform):
         }
 
     @abstractmethod
-    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
         pass
 
     def __init__(
@@ -371,7 +371,7 @@ class GenericGpu(Platform):
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
 
-        if isinstance(dtype, PsScalarType) and func in NumericLimitsFunctions:
+        if isinstance(dtype, PsScalarType) and isinstance(func, NumericLimitsFunctions):
             return self.resolve_numeric_limits(func, dtype)
 
         if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index 60e249aeb..45d60452b 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from .generic_gpu import GenericGpu
 from ..ast.expressions import PsExpression, PsLiteralExpr
-from ..functions import PsMathFunction
+from ..functions import NumericLimitsFunctions
 from ..literals import PsLiteral
 from ...types import PsType, PsIeeeFloatType
 
@@ -17,7 +17,7 @@ class HipPlatform(GenericGpu):
             "<limits>"
         }
 
-    def resolve_numeric_limits(self, func: PsMathFunction, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         return PsLiteralExpr(
-- 
GitLab


From 806dcb6b2aaa42849d8af7f12b62b730eec7fa0e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 27 Mar 2025 17:02:57 +0100
Subject: [PATCH 135/180] Move resolution of reductions to concrete gpu
 platform classes

---
 src/pystencils/backend/platforms/cuda.py      | 114 +++++++++++++-
 .../backend/platforms/generic_gpu.py          | 143 ++++++------------
 src/pystencils/backend/platforms/hip.py       |  22 ++-
 3 files changed, 172 insertions(+), 107 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 7a5074677..da8375c5e 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -1,11 +1,32 @@
 from __future__ import annotations
 
+import math
+
 from .generic_gpu import GenericGpu
-from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast import PsAstNode
+from ..ast.expressions import (
+    PsExpression,
+    PsLiteralExpr,
+    PsCall,
+    PsAnd,
+    PsConstantExpr,
+    PsSymbolExpr,
+)
+from ..ast.structural import (
+    PsConditional,
+    PsStatement,
+    PsAssignment,
+    PsBlock,
+    PsStructuralNode,
+)
+from ..constants import PsConstant
 from ..exceptions import MaterializationError
-from ..functions import NumericLimitsFunctions
+from ..functions import NumericLimitsFunctions, CFunction
 from ..literals import PsLiteral
-from ...types import PsType, PsIeeeFloatType
+from ...compound_op_mapping import compound_op_to_expr
+from ...sympyextensions import ReductionOp
+from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType
+from ...types.quick import SInt, UInt
 
 
 class CudaPlatform(GenericGpu):
@@ -17,7 +38,92 @@ class CudaPlatform(GenericGpu):
             '"npp.h"',
         }
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        """Resolve a reduction into optional warp shuffles and a guarded atomic call.
+
+        Raises NotImplementedError unless the reduced symbol is float32/float64."""
+        stype = symbol_expr.dtype
+        ptrtype = ptr_expr.dtype
+
+        assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+        assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+        if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+            raise NotImplementedError(
+                "atomic operations are only available for float32/64 datatypes"
+            )
+
+        # subtraction: partial results are combined by addition (cf. OpenMP reductions)
+        actual_reduction_op = (
+            ReductionOp.Add if reduction_op is ReductionOp.Sub else reduction_op
+        )
+
+        # check if thread is valid for performing reduction
+        ispace = self._ctx.get_iteration_space()
+        is_valid_thread = self._get_condition_for_translation(ispace)
+
+        cond: PsExpression
+        shuffles: tuple[PsAssignment, ...]
+        if self._warp_size and self._assume_warp_aligned_block_size:
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(
+                    CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                    [
+                        full_mask,
+                        symbol_expr,
+                        PsConstantExpr(PsConstant(offset, SInt(32))),
+                    ],
+                )
+
+            # set up shuffle instructions for warp-level reduction
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(
+                PsAssignment(
+                    symbol_expr,
+                    compound_op_to_expr(
+                        actual_reduction_op,
+                        symbol_expr,
+                        gen_shuffle_instr(pow(2, i - 1)),
+                    ),
+                )
+                for i in reversed(range(1, num_shuffles))
+            )
+
+            # find first thread in warp
+            first_thread_in_warp = self._first_thread_in_warp(ispace)
+
+            # set condition to only execute atomic operation on first valid thread in warp
+            cond = (
+                PsAnd(is_valid_thread, first_thread_in_warp)
+                if is_valid_thread
+                else first_thread_in_warp
+            )
+        else:
+            # no optimization: only execute atomic operation on valid thread
+            shuffles = ()
+            cond = is_valid_thread
+
+        # use atomic operation
+        func = CFunction(
+            f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void")
+        )
+        func_args = (ptr_expr, symbol_expr)
+
+        # assemble warp reduction
+        return shuffles, PsConditional(
+            cond, PsBlock([PsStatement(PsCall(func, func_args))])
+        )
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         match func:
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 8b7eead8d..8a4dd11a2 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -1,16 +1,19 @@
 from __future__ import annotations
 
-import math
 import operator
 from abc import ABC, abstractmethod
 from functools import reduce
 
 from ..ast import PsAstNode
 from ..constants import PsConstant
-from ...compound_op_mapping import compound_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
-from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType
-from ...types.quick import UInt, SInt
+from ...types import (
+    constify,
+    deconstify,
+    PsScalarType,
+    PsType,
+)
+from ...types.quick import SInt
 from ..exceptions import MaterializationError
 from .platform import Platform
 
@@ -28,8 +31,6 @@ from ..ast.structural import (
     PsBlock,
     PsConditional,
     PsDeclaration,
-    PsStatement,
-    PsAssignment,
     PsStructuralNode,
 )
 from ..ast.expressions import (
@@ -39,7 +40,6 @@ from ..ast.expressions import (
     PsCall,
     PsLookup,
     PsBufferAcc,
-    PsSymbolExpr,
     PsConstantExpr,
     PsAdd,
     PsRem,
@@ -206,7 +206,18 @@ class GenericGpu(Platform):
         }
 
     @abstractmethod
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        pass
+
+    @abstractmethod
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
         pass
 
     def __init__(
@@ -262,6 +273,31 @@ class GenericGpu(Platform):
         else:
             raise MaterializationError(f"Unknown type of iteration space: {ispace}")
 
+    @staticmethod
+    def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]:
+        """Returns thread indices multiplied with block dimension strides per dimension."""
+        # entry i is THREAD_IDX[i] times the product of BLOCK_DIM[:i] (1 for i == 0)
+        return tuple(
+            idx
+            * PsConstantExpr(
+                PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
+            )
+            for i, idx in enumerate(THREAD_IDX[: ispace.rank])
+        )
+
+    def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression:
+        """Returns expression that determines whether a thread is the first within a warp."""
+        # add up the per-dimension terms to obtain one combined thread index
+        tids_per_dim = GenericGpu._thread_index_per_dim(ispace)
+        tid: PsExpression = tids_per_dim[0]
+        for t in tids_per_dim[1:]:
+            tid = PsAdd(tid, t)
+        # compare the combined index modulo the warp size against zero
+        return PsEq(
+            PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
+            PsConstantExpr(PsConstant(0, SInt(32))),
+        )
+
     def select_function(
         self, call: PsCall
     ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]:
@@ -276,97 +312,8 @@ class GenericGpu(Platform):
         ):
             ptr_expr, symbol_expr = call.args
             op = call_func.reduction_op
-            stype = symbol_expr.dtype
-            ptrtype = ptr_expr.dtype
-
-            assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(
-                ptrtype, PsPointerType
-            )
-            assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(
-                stype, PsScalarType
-            )
-
-            if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
-                NotImplementedError(
-                    "atomic operations are only available for float32/64 datatypes"
-                )
-
-            # workaround for subtractions -> use additions for reducing intermediate results
-            # similar to OpenMP reductions: local copies (negative sign) are added at the end
-            match op:
-                case ReductionOp.Sub:
-                    actual_op = ReductionOp.Add
-                case _:
-                    actual_op = op
-
-            # check if thread is valid for performing reduction
-            ispace = self._ctx.get_iteration_space()
-            is_valid_thread = self._get_condition_for_translation(ispace)
-
-            cond: PsExpression
-            shuffles: tuple[PsAssignment, ...]
-            if self._warp_size and self._assume_warp_aligned_block_size:
-                # perform local warp reductions
-                def gen_shuffle_instr(offset: int):
-                    full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
-                    return PsCall(
-                        CFunction(
-                            "__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype
-                        ),
-                        [
-                            full_mask,
-                            symbol_expr,
-                            PsConstantExpr(PsConstant(offset, SInt(32))),
-                        ],
-                    )
-
-                # set up shuffle instructions for warp-level reduction
-                num_shuffles = math.frexp(self._warp_size)[1]
-                shuffles = tuple(
-                    PsAssignment(
-                        symbol_expr,
-                        compound_op_to_expr(
-                            actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))
-                        ),
-                    )
-                    for i in reversed(range(1, num_shuffles))
-                )
-
-                # find first thread in warp
-                thread_indices_per_dim = [
-                    idx
-                    * PsConstantExpr(
-                        PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32))
-                    )
-                    for i, idx in enumerate(THREAD_IDX[: ispace.rank])
-                ]
-                tid: PsExpression = thread_indices_per_dim[0]
-                for t in thread_indices_per_dim[1:]:
-                    tid = PsAdd(tid, t)
-                first_thread_in_warp = PsEq(
-                    PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))),
-                    PsConstantExpr(PsConstant(0, SInt(32))),
-                )
-
-                # set condition to only execute atomic operation on first valid thread in warp
-                cond = (
-                    PsAnd(is_valid_thread, first_thread_in_warp)
-                    if is_valid_thread
-                    else first_thread_in_warp
-                )
-            else:
-                # no optimization: only execute atomic add on valid thread
-                shuffles = ()
-                cond = is_valid_thread
-
-            # use atomic operation
-            call.function = CFunction(
-                f"atomic{actual_op.name}", [ptrtype, stype], PsCustomType("void")
-            )
-            call.args = (ptr_expr, symbol_expr)
 
-            # assemble warp reduction
-            return shuffles, PsConditional(cond, PsBlock([PsStatement(call)]))
+            return self.resolve_reduction(ptr_expr, symbol_expr, op)
 
         dtype = call.get_dtype()
         arg_types = (dtype,) * func.num_args
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index 45d60452b..404d9bb27 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,9 +1,13 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast import PsAstNode
 from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast.structural import PsStructuralNode
+from ..exceptions import MaterializationError
 from ..functions import NumericLimitsFunctions
 from ..literals import PsLiteral
+from ...sympyextensions import ReductionOp
 from ...types import PsType, PsIeeeFloatType
 
 
@@ -12,12 +16,11 @@ class HipPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return super().required_headers | {
-            '"pystencils_runtime/hip.h"',
-            "<limits>"
-        }
+        return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"}
 
-    def resolve_numeric_limits(self, func: NumericLimitsFunctions, dtype: PsType) -> PsExpression:
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
         assert isinstance(dtype, PsIeeeFloatType)
 
         return PsLiteralExpr(
@@ -26,3 +29,12 @@ class HipPlatform(GenericGpu):
                 dtype,
             )
         )
+
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        """Unconditionally raise MaterializationError; no reduction lowering here."""
+        raise MaterializationError("Reductions are yet not supported in HIP backend.")
-- 
GitLab


From b008a9e9954b83fd371c572c321b26597211a9c1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 28 Mar 2025 13:26:09 +0100
Subject: [PATCH 136/180] Adapt guards for generated avx512 horizontal ops

---
 src/pystencils/include/simd_horizontal_helpers.h | 2 +-
 util/generate_simd_horizontal_op.py              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h
index cd4bd5730..bd1889153 100644
--- a/src/pystencils/include/simd_horizontal_helpers.h
+++ b/src/pystencils/include/simd_horizontal_helpers.h
@@ -120,7 +120,7 @@ inline float _mm256_horizontal_max_ps(float dst, __m256 src) {
 
 #endif
 
-#if defined(__AVX512VL__)
+#if defined(__AVX512F__)
 #include <immintrin.h>
 
 inline double _mm512_horizontal_add_pd(double dst, __m512d src) { 
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
index aebbf35bb..1d652c6e1 100644
--- a/util/generate_simd_horizontal_op.py
+++ b/util/generate_simd_horizontal_op.py
@@ -277,7 +277,7 @@ vtypes_for_instruction_set = {
 guards_for_instruction_sets = {
     InstructionSets.SSE3: "__SSE3__",
     InstructionSets.AVX: "__AVX__",
-    InstructionSets.AVX512: '__AVX512VL__',
+    InstructionSets.AVX512: '__AVX512F__',
     InstructionSets.NEON: '_M_ARM64',
 }
 
-- 
GitLab


From 7b43ffd2e12f5d69ccc30fdf451ee47d7caea6ae Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 31 Mar 2025 13:45:15 +0200
Subject: [PATCH 137/180] Add minor comment to ReductionInfo dataclass

---
 src/pystencils/backend/kernelcreation/context.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 536c73c7f..358b5ff6c 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -49,6 +49,8 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 
 @dataclass(frozen=True)
 class ReductionInfo:
+    """Information about a reduction operation, its neutral element in form of an initial value
+    and the pointer used by the kernel as write-back argument."""
 
     op: ReductionOp
     init_val: PsExpression
-- 
GitLab


From 3484e7f794e4e720d7bc5931b90a4b7caf2ffc59 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 31 Mar 2025 13:49:47 +0200
Subject: [PATCH 138/180] Rename compound op mapping funcs

---
 src/pystencils/backend/kernelcreation/freeze.py        | 10 ++++++----
 src/pystencils/backend/platforms/generic_cpu.py        |  4 ++--
 src/pystencils/backend/platforms/generic_gpu.py        |  4 ++--
 ...{compound_op_mapping.py => reduction_op_mapping.py} |  6 +++---
 4 files changed, 13 insertions(+), 11 deletions(-)
 rename src/pystencils/{compound_op_mapping.py => reduction_op_mapping.py} (82%)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 63e9ea5b1..5cdb3864c 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -13,7 +13,7 @@ from ...sympyextensions import (
     integer_functions,
     ConditionalFieldAccess,
 )
-from ...compound_op_mapping import compound_op_to_expr
+from ...reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReductionAssignment, ReductionOp
@@ -174,15 +174,17 @@ class FreezeExpressions:
         assert isinstance(lhs, PsExpression)
         assert isinstance(rhs, PsExpression)
 
-        _str_to_compound_op: dict[str, ReductionOp] = {
+        # transform augmented assignment to reduction op
+        str_to_reduction_op: dict[str, ReductionOp] = {
             "+=": ReductionOp.Add,
             "-=": ReductionOp.Sub,
             "*=": ReductionOp.Mul,
             "/=": ReductionOp.Div,
         }
 
+        # reuse existing handling for transforming reduction ops to expressions
         return PsAssignment(
-            lhs, compound_op_to_expr(_str_to_compound_op[expr.op], lhs.clone(), rhs)
+            lhs, reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs)
         )
 
     def map_ReductionAssignment(self, expr: ReductionAssignment):
@@ -208,7 +210,7 @@ class FreezeExpressions:
         new_lhs = PsSymbolExpr(new_lhs_symb)
 
         # get new rhs from augmented assignment
-        new_rhs: PsExpression = compound_op_to_expr(op, new_lhs.clone(), rhs)
+        new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs)
 
         # match for reduction operation and set neutral init_val
         init_val: PsExpression
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index ccef61817..3de7cf696 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -13,7 +13,7 @@ from ..functions import (
     PsReductionFunction,
 )
 from ..literals import PsLiteral
-from ...compound_op_mapping import compound_op_to_expr
+from ...reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions import ReductionOp
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType
 
@@ -97,7 +97,7 @@ class GenericCpu(Platform):
             actual_op = ReductionOp.Add if op is ReductionOp.Sub else op
 
             # create binop and potentially select corresponding function for e.g. min or max
-            potential_call = compound_op_to_expr(actual_op, ptr_access, symbol_expr)
+            potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr)
             if isinstance(potential_call, PsCall):
                 potential_call.dtype = symbol_expr.dtype
                 return self.select_function(potential_call)
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 8b7eead8d..349e79d4b 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -7,7 +7,7 @@ from functools import reduce
 
 from ..ast import PsAstNode
 from ..constants import PsConstant
-from ...compound_op_mapping import compound_op_to_expr
+from ...reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions.reduction import ReductionOp
 from ...types import constify, deconstify, PsPointerType, PsScalarType, PsCustomType, PsType
 from ...types.quick import UInt, SInt
@@ -325,7 +325,7 @@ class GenericGpu(Platform):
                 shuffles = tuple(
                     PsAssignment(
                         symbol_expr,
-                        compound_op_to_expr(
+                        reduction_op_to_expr(
                             actual_op, symbol_expr, gen_shuffle_instr(pow(2, i - 1))
                         ),
                     )
diff --git a/src/pystencils/compound_op_mapping.py b/src/pystencils/reduction_op_mapping.py
similarity index 82%
rename from src/pystencils/compound_op_mapping.py
rename to src/pystencils/reduction_op_mapping.py
index 193b308d0..06fb8aa3e 100644
--- a/src/pystencils/compound_op_mapping.py
+++ b/src/pystencils/reduction_op_mapping.py
@@ -11,7 +11,7 @@ _available_operator_interface: set[ReductionOp] = {
 }
 
 
-def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
+def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     if op in _available_operator_interface:
         match op:
             case ReductionOp.Add:
@@ -24,7 +24,7 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
                 operator = PsDiv
             case _:
                 raise FreezeError(
-                    f"Found unsupported operation type for compound assignments: {op}."
+                    f"Found unsupported operation type for reduction assignments: {op}."
                 )
         return operator(op1, op2)
     else:
@@ -35,5 +35,5 @@ def compound_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
                 return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
             case _:
                 raise FreezeError(
-                    f"Found unsupported operation type for compound assignments: {op}."
+                    f"Found unsupported operation type for reduction assignments: {op}."
                 )
-- 
GitLab


From 492fb30a9da935fe0273201de79e90b1eb1ae338 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Mon, 31 Mar 2025 13:53:34 +0200
Subject: [PATCH 139/180] Adapt error messages for reduced assignments in
 freeze.py

---
 src/pystencils/backend/kernelcreation/freeze.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 5cdb3864c..9dc3928b3 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -200,7 +200,8 @@ class FreezeExpressions:
         orig_lhs_symb = lhs.symbol
         dtype = lhs.dtype
 
-        assert isinstance(dtype, PsNumericType)
+        assert isinstance(dtype, PsNumericType), \
+            "Reduction assignments require type information of the lhs symbol."
 
         # replace original symbol with pointer-based type used for export
         orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
@@ -226,7 +227,7 @@ class FreezeExpressions:
             case ReductionOp.Max:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
             case _:
-                raise FreezeError(f"Unsupported reduced assignment: {op}.")
+                raise FreezeError(f"Unsupported kind of reduction assignment: {op}.")
 
         reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr)
 
-- 
GitLab


From f469e70e18c278385ab4237b3f30afb70da4c606 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 2 Apr 2025 19:19:28 +0200
Subject: [PATCH 140/180] Extend test_reduction_gpu with
 assume_warp_aligned_block_size and use_block_fitting parameters

---
 tests/kernelcreation/test_reduction.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index 6e2b2f3fe..cd1710cf5 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -52,7 +52,14 @@ def test_reduction_cpu(instruction_set, dtype, op):
 
 @pytest.mark.parametrize("dtype", ["float64", "float32"])
 @pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
-def test_reduction_gpu(dtype, op):
+@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
+@pytest.mark.parametrize("use_block_fitting", [True, False])
+def test_reduction_gpu(
+        dtype: str,
+        op: str,
+        assume_warp_aligned_block_size: bool,
+        use_block_fitting: bool,
+):
     try:
         import cupy as cp
         from cupy_backends.cuda.api.runtime import CUDARuntimeError
@@ -66,12 +73,16 @@ def test_reduction_gpu(dtype, op):
             reason="No CUDA capable device is detected", allow_module_level=True
         )
 
-    config = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
 
-    ast_reduction = get_reduction_assign_ast(dtype, op, config)
+    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
     ps.show_code(ast_reduction)
     kernel_reduction = ast_reduction.compile()
 
+    if use_block_fitting:
+        kernel_reduction.launch_config.fit_block_size((32, 1, 1))
+
     array = np.full((SIZE,), INIT_ARR, dtype=dtype)
     reduction_array = np.full((1,), INIT_W, dtype=dtype)
 
-- 
GitLab


From 5bef84a8e28279c46cd73b67352201622d9aa89e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 2 Apr 2025 19:47:49 +0200
Subject: [PATCH 141/180] Write howto guide for reductions

---
 docs/source/backend/gpu_codegen.md       |   1 +
 docs/source/backend/index.rst            |   1 +
 docs/source/backend/reduction_codegen.md | 122 +++++++++++++++++++++
 docs/source/index.rst                    |   1 +
 docs/source/user_manual/reductions.md    | 132 +++++++++++++++++++++++
 5 files changed, 257 insertions(+)
 create mode 100644 docs/source/backend/reduction_codegen.md
 create mode 100644 docs/source/user_manual/reductions.md

diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 3fe00840e..a95c36566 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -1,3 +1,4 @@
+(gpu_codegen)=
 # GPU Code Generation
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 0d384c55b..b9b400544 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -16,6 +16,7 @@ who wish to customize or extend the behaviour of the code generator in their app
     iteration_space
     translation
     platforms
+    reduction_codegen
     transformations
     gpu_codegen
     errors
diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
new file mode 100644
index 000000000..360c69256
--- /dev/null
+++ b/docs/source/backend/reduction_codegen.md
@@ -0,0 +1,122 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import pystencils as ps
+import numpy as np
+import cupy as cp
+```
+
+(codegen_reductions)=
+# Code Generation for Reductions
+
+In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain
+optimization strategies have.
+For this, we set up the update rule for a simple dot product kernel:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x, y = ps.fields(f"x, y: double[3D]", layout="fzyx")
+
+assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center())
+```
+
+## CPU Platforms
+
+We first consider a base variant for CPUs without employing any optimizations.
+The generated code for this variant looks as follows:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel_cpu = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu)
+```
+
+We want the reduction kernel to be SIMD vectorized and employ shared-memory parallelism using OpenMP.
+The supported SIMD instruction sets for reductions are:
+* SSE3
+* AVX/AVX2
+* AVX512
+
+Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`.
+**Note that reductions require `assume_inner_stride_one` to be enabled.**
+This is due to the fact that other inner strides would require masked SIMD operations 
+which are not supported yet.
+
+```{code-cell} ipython3
+# configure SIMD vectorization
+cfg = ps.CreateKernelConfig(
+  target=ps.Target.X86_AVX,
+)
+cfg.cpu.vectorize.enable = True
+cfg.cpu.vectorize.assume_inner_stride_one = True
+
+# configure OpenMP parallelization
+cfg.cpu.openmp.enable = True
+cfg.cpu.openmp.num_threads = 8
+
+kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu_opt)
+```
+
+## GPU Platforms
+
+Reductions are currently only supported for CUDA platforms.
+Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown:
+
+```{code-cell} ipython3
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+    kernel_gpu = ps.create_kernel(assign_dot_prod, cfg)
+
+    ps.inspect(kernel_gpu)
+```
+
+As evident from the code, the generated kernel employs atomic operations for updating the memory location
+that holds the reduction result.
+Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance compared to
+only using atomic operations.
+To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by 
+the hardware's warp size.
+**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this
+criterion by enabling `assume_warp_aligned_block_size`.**
+While the default block sizes provided by the code generator already fulfill this criterion,
+we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space.
+
+You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
+
+```{code-cell} ipython3
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.assume_warp_aligned_block_size = True
+
+    kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+    
+    kernel_func = kernel_gpu_opt.compile()
+    kernel_func.launch_config.fit_block_size((32, 1, 1))
+
+    ps.inspect(kernel_gpu_opt)
+```
+
+:::{admonition} Developers To Do:
+
+- Support for HIP platforms
+- Support vectorization using NEON intrinsics
+:::
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6dba50af1..4e1070979 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Topics
 
   user_manual/symbolic_language
   user_manual/kernelcreation
+  user_manual/reductions
   user_manual/gpu_kernels
   user_manual/WorkingWithTypes
 
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
new file mode 100644
index 000000000..454fd7df4
--- /dev/null
+++ b/docs/source/user_manual/reductions.md
@@ -0,0 +1,132 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import sympy as sp
+import pystencils as ps
+import numpy as np
+import cupy as cp
+
+from enum import Enum
+```
+
+(guide_reductions)=
+# Reductions in Pystencils
+
+Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements,
+for example by computing the sum or product of an array's entries or by finding its minimum or maximum.
+
+## Specifying Assignments with Reductions
+
+In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`.
+Here is a snippet creating a reduction assignment for adding up all elements of a field:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x = ps.fields(f"x: double[3D]", layout="fzyx")
+
+assign_sum = ps.AddReductionAssignment(r, x.center())
+```
+
+For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the 
+right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions.
+
+**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.**
+
+The following reduction assignment classes are available in pystencils:    
+* `AddReductionAssignment`: Builds sum over elements
+* `SubReductionAssignment`: Builds difference over elements
+* `MulReductionAssignment`: Builds product over elements
+* `MinReductionAssignment`: Finds minimum element
+* `MaxReductionAssignment`: Finds maximum element
+
+:::{note}
+AlternatÃvely, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions
+to specify reduction assignments:
+:::
+
+```{code-cell} ipython3
+from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str
+from pystencils.sympyextensions.reduction import ReductionOp
+
+assign_sum = reduction_assignment(r, ReductionOp.Add, x.center())
+
+assign_sum = reduction_assignment_from_str(r, "+", x.center())
+```
+
+For other reduction operations, the following enums can be passed to `reduction_assignment` 
+or the corresponding strings can be passed to `reduction_assignment_from_str`.
+
+```{code-cell} python3
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Min = "min"
+    Max = "max"
+```
+
+## Generating Reduction Kernels
+
+With the assignments being fully assembled, we can finally invoke the code generator and 
+create the kernel object via the {any}`create_kernel` function. 
+For this example, we assume a kernel configuration where no optimizations are explicitly enabled.
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cfg)
+
+ps.inspect(kernel)
+```
+
+:::{note}
+The generated reduction kernels may vary vastly for different platforms and optimizations.
+For the sake of compactness, the impact of different backend or optimization choices is left out.
+
+A detailed description of configuration choices and their impact on the generated code can be found in
+{ref}`codegen_reductions`.
+:::
+
+The kernel can be compiled and run immediately.
+
+To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed for each field
+but also one for exporting reduction results. 
+The export mechanism can be seen in the previously generated code snippet. 
+There, the kernel receives a pointer named after the reduction symbol (in this case `r`).
+This pointer not only allows providing initial values for the reduction but is also used for writing back the
+reduction result. 
+Since our reduction result is a single scalar value, it is sufficient to set up an array containing a single element.
+
+```{code-cell} ipython3
+    kernel_func = kernel.compile()
+
+    x_array = np.ones((4, 4, 4), dtype="float64")
+    reduction_result = np.zeros((1,), dtype="float64")
+
+    kernel_func(x=x_array, r=reduction_result)
+    
+    reduction_result[0]
+```
+
+For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, 
+i.e. instances of {any}`cupy.ndarray`.
+
+:::{admonition} Developers To Do:
+
+- Support for higher-order data types for reductions, e.g. vector/matrix reductions
+:::
-- 
GitLab


From a2060520faddc99c58c8f22eedfe267d09d525b1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 10:54:44 +0200
Subject: [PATCH 142/180] Get rid of reduction_assignment_from_str

---
 docs/source/user_manual/reductions.md      |  9 +++------
 src/pystencils/sympyextensions/__init__.py |  3 +--
 tests/kernelcreation/test_reduction.py     | 18 +++++++++---------
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 454fd7df4..6af0e8580 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -56,21 +56,18 @@ The following reduction assignment classes are available in pystencils:
 * `MaxReductionAssignment`: Finds maximum element
 
 :::{note}
-AlternatÃvely, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions
+Alternatively, you can also make use of the `reduction_assignment` function
 to specify reduction assignments:
 :::
 
 ```{code-cell} ipython3
-from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str
+from pystencils.sympyextensions import reduction_assignment
 from pystencils.sympyextensions.reduction import ReductionOp
 
 assign_sum = reduction_assignment(r, ReductionOp.Add, x.center())
-
-assign_sum = reduction_assignment_from_str(r, "+", x.center())
 ```
 
-For other reduction operations, the following enums can be passed to `reduction_assignment` 
-or the corresponding strings can be passed to `reduction_assignment_from_str`.
+For other reduction operations, the following enums can be passed to `reduction_assignment`.
 
 ```{code-cell} python3
 class ReductionOp(Enum):
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index bd0fa1fe9..c575feeb3 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,7 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType
 from .pointers import mem_acc
-from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp
+from .reduction import reduction_assignment, ReductionOp
 
 from .math import (
     prod,
@@ -35,7 +35,6 @@ from .math import (
 __all__ = [
     "ConditionalFieldAccess",
     "reduction_assignment",
-    "reduction_assignment_from_str",
     "ReductionOp",
     "TypedSymbol",
     "CastFunc",
diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py
index cd1710cf5..1fb8efc81 100644
--- a/tests/kernelcreation/test_reduction.py
+++ b/tests/kernelcreation/test_reduction.py
@@ -2,17 +2,17 @@ import pytest
 import numpy as np
 
 import pystencils as ps
-from pystencils.sympyextensions import reduction_assignment_from_str
+from pystencils.sympyextensions import ReductionOp, reduction_assignment
 
 INIT_W = 5
 INIT_ARR = 2
 SIZE = 15
 SOLUTION = {
-    "+": INIT_W + INIT_ARR * SIZE,
-    "-": INIT_W - INIT_ARR * SIZE,
-    "*": INIT_W * INIT_ARR**SIZE,
-    "min": min(INIT_W, INIT_ARR),
-    "max": max(INIT_W, INIT_ARR),
+    ReductionOp.Add: INIT_W + INIT_ARR * SIZE,
+    ReductionOp.Sub: INIT_W - INIT_ARR * SIZE,
+    ReductionOp.Mul: INIT_W * INIT_ARR**SIZE,
+    ReductionOp.Min: min(INIT_W, INIT_ARR),
+    ReductionOp.Max: max(INIT_W, INIT_ARR),
 }
 
 
@@ -21,14 +21,14 @@ def get_reduction_assign_ast(dtype, op, config):
     x = ps.fields(f"x: {dtype}[1d]")
     w = ps.TypedSymbol("w", dtype)
 
-    red_assign = reduction_assignment_from_str(w, op, x.center())
+    red_assign = reduction_assignment(w, op, x.center())
 
     return ps.create_kernel([red_assign], config, default_dtype=dtype)
 
 
 @pytest.mark.parametrize("instruction_set", ["sse", "avx"])
 @pytest.mark.parametrize("dtype", ["float64", "float32"])
-@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max])
 def test_reduction_cpu(instruction_set, dtype, op):
     vectorize_info = {
         "instruction_set": instruction_set,
@@ -51,7 +51,7 @@ def test_reduction_cpu(instruction_set, dtype, op):
 
 
 @pytest.mark.parametrize("dtype", ["float64", "float32"])
-@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"])
+@pytest.mark.parametrize("op", [ReductionOp.Add, ReductionOp.Sub, ReductionOp.Mul, ReductionOp.Min, ReductionOp.Max])
 @pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False])
 @pytest.mark.parametrize("use_block_fitting", [True, False])
 def test_reduction_gpu(
-- 
GitLab


From 21df6f4b5915134b537c20398b8bdeec00b6b28e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 10:57:43 +0200
Subject: [PATCH 143/180] Omit admonitions from docs

---
 docs/source/backend/reduction_codegen.md | 6 ------
 docs/source/user_manual/reductions.md    | 5 -----
 2 files changed, 11 deletions(-)

diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
index 360c69256..f08fa980a 100644
--- a/docs/source/backend/reduction_codegen.md
+++ b/docs/source/backend/reduction_codegen.md
@@ -114,9 +114,3 @@ You can find more detailed information about warp size alignment in {ref}`gpu_co
     ps.inspect(kernel_gpu_opt)
 ```
 
-:::{admonition} Developers To Do:
-
-- Support for HIP platforms
-- Support vectorization using NEON intrinsics
-:::
-
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 6af0e8580..0b0c04279 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -122,8 +122,3 @@ Since our reduction result is a single scalar value, it is sufficient to set up
 
 For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, 
 i.e. instances of {any}`cupy.ndarray`.
-
-:::{admonition} Developers To Do:
-
-- Support for higher-order data types for reductions, e.g. vector/matrix reductions
-:::
-- 
GitLab


From 99726a97a0a834dfb53960ad1fb72be4f72849e2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 11:30:29 +0200
Subject: [PATCH 144/180] Add docstring to PsVecHorizontal

---
 src/pystencils/backend/ast/vector.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 4141b0296..291d76e50 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -41,7 +41,16 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
 
 
 class PsVecHorizontal(PsBinOp, PsVectorOp):
-    """Extracts scalar value from N vector lanes."""
+    """Represents a binary operation between a scalar and a vector operand.
+    With the binary operation not being vectorized, a horizontal reduction
+    along the lanes of the vector operand is required to extract a scalar value.
+    The result type will be equal to the type of the scalar operand.
+
+    Args:
+        scalar_operand: Scalar operand
+        vector_operand: Vector operand to be converted to a scalar value
+        reduction_op: Binary operation that is also used for the horizontal reduction
+    """
 
     __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
 
-- 
GitLab


From f7142b16ae26f30f96f07e0063c65389fbc34b3f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 11:32:47 +0200
Subject: [PATCH 145/180] Remove lanes arg from PsVecHorizontal

---
 src/pystencils/backend/ast/vector.py                | 13 +------------
 src/pystencils/backend/emission/ir_printer.py       |  4 ++--
 .../backend/transformations/loop_vectorizer.py      |  1 -
 3 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 291d76e50..a074ea6ff 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -56,23 +56,13 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
 
     def __init__(
         self,
-        lanes: int,
         scalar_operand: PsExpression,
         vector_operand: PsExpression,
         reduction_op: ReductionOp,
     ):
         super().__init__(scalar_operand, vector_operand)
-        self._lanes = lanes
         self._reduction_op = reduction_op
 
-    @property
-    def lanes(self) -> int:
-        return self._lanes
-
-    @lanes.setter
-    def lanes(self, n: int):
-        self._lanes = n
-
     @property
     def scalar_operand(self) -> PsExpression:
         return self._op1
@@ -99,7 +89,7 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
 
     def _clone_expr(self) -> PsVecHorizontal:
         return PsVecHorizontal(
-            self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op
+            self._op1.clone(), self._op2.clone(), self._reduction_op
         )
 
     def structurally_equal(self, other: PsAstNode) -> bool:
@@ -107,7 +97,6 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
             return False
         return (
             super().structurally_equal(other)
-            and self._lanes == other._lanes
             and self._reduction_op == other._reduction_op
         )
 
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index 22ae2f91a..5a3836d50 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -77,14 +77,14 @@ class IRAstPrinter(BasePrinter):
                     f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest
                 )
 
-            case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op):
+            case PsVecHorizontal(scalar_operand, vector_operand, reduction_op):
                 pc.push_op(Ops.Weakest, LR.Middle)
                 scalar_operand_code = self.visit(scalar_operand, pc)
                 vector_operand_code = self.visit(vector_operand, pc)
                 pc.pop_op()
 
                 return pc.parenthesize(
-                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code, vector_operand_code})",
+                    f"vec_horizontal_{reduction_op.name.lower()}({scalar_operand_code, vector_operand_code})",
                     Ops.Weakest,
                 )
 
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index 48b9ad0da..09b0aa5dd 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -162,7 +162,6 @@ class LoopVectorizer:
                 PsAssignment(
                     PsSymbolExpr(symb),
                     PsVecHorizontal(
-                        self._lanes,
                         PsSymbolExpr(symb),
                         PsSymbolExpr(vector_symb),
                         reduction_info.op,
-- 
GitLab


From 427e53442e166cf34a23e7588a1f8cd72ce52438 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 11:35:47 +0200
Subject: [PATCH 146/180] Remove more parts of reduction_assignment_from_str

---
 src/pystencils/sympyextensions/reduction.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index e95e37c24..81da0dde9 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -66,17 +66,8 @@ _reduction_assignment_classes = {
     ]
 }
 
-# Mapping from ReductionOp str to ReductionAssigment classes
-_reduction_assignment_classes_for_str = {
-    cls.value: cls for cls in _reduction_assignment_classes
-}
-
 
 def reduction_assignment(lhs, op: ReductionOp, rhs):
     if op not in _reduction_assignment_classes:
         raise ValueError("Unrecognized operator %s" % op)
     return _reduction_assignment_classes[op](lhs, rhs)
-
-
-def reduction_assignment_from_str(lhs, op: str, rhs):
-    return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs)
-- 
GitLab


From 36124cd24e89342ae96dd0bee8a05bdfc2542092 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 11:58:34 +0200
Subject: [PATCH 147/180] Add check for typed symbols for ReductionAssignment
 constructor

---
 src/pystencils/sympyextensions/reduction.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 81da0dde9..794c40451 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -2,6 +2,8 @@ from enum import Enum
 
 from sympy.codegen.ast import AssignmentBase
 
+from pystencils import TypedSymbol
+
 
 class ReductionOp(Enum):
     Add = "+"
@@ -33,6 +35,13 @@ class ReductionAssignment(AssignmentBase):
     def reduction_op(self, op):
         self._reduction_op = op
 
+    @classmethod
+    def _check_args(cls, lhs, rhs):
+        super()._check_args(lhs, rhs)
+
+        if not isinstance(lhs, TypedSymbol):
+            raise TypeError(f"lhs of needs to be a TypedSymbol. Got {type(lhs)} instead.")
+
 
 class AddReductionAssignment(ReductionAssignment):
     reduction_op = ReductionOp.Add
-- 
GitLab


From 9e61ccebe4e6a37205441f94573f714e987d17cd Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 12:58:38 +0200
Subject: [PATCH 148/180] Omit lanes for match args in PsVecHorizontal

---
 src/pystencils/backend/ast/vector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index a074ea6ff..55db67e7c 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -52,7 +52,7 @@ class PsVecHorizontal(PsBinOp, PsVectorOp):
         reduction_op: Binary operation that is also used for the horizontal reduction
     """
 
-    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
+    __match_args__ = ("scalar_operand", "vector_operand", "reduction_op")
 
     def __init__(
         self,
-- 
GitLab


From 7b74cafaedc589a52f91e7a28d60528956edd19f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 13:01:20 +0200
Subject: [PATCH 149/180] Fix import

---
 src/pystencils/sympyextensions/reduction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
index 794c40451..a1a9a026c 100644
--- a/src/pystencils/sympyextensions/reduction.py
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -2,7 +2,7 @@ from enum import Enum
 
 from sympy.codegen.ast import AssignmentBase
 
-from pystencils import TypedSymbol
+from . import TypedSymbol
 
 
 class ReductionOp(Enum):
-- 
GitLab


From 8394f0f4c115cb37c5e53f3e08f62a32bd128a3e Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 14:18:29 +0200
Subject: [PATCH 150/180] Move reduction_op_mapping.py

---
 src/pystencils/backend/kernelcreation/freeze.py      | 2 +-
 src/pystencils/backend/platforms/cuda.py             | 2 +-
 src/pystencils/backend/platforms/generic_cpu.py      | 2 +-
 src/pystencils/{ => backend}/reduction_op_mapping.py | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)
 rename src/pystencils/{ => backend}/reduction_op_mapping.py (82%)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 9dc3928b3..c5ff43fb9 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -13,7 +13,7 @@ from ...sympyextensions import (
     integer_functions,
     ConditionalFieldAccess,
 )
-from ...reduction_op_mapping import reduction_op_to_expr
+from ..reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType
 from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReductionAssignment, ReductionOp
diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 8c3cd45fa..05e95011d 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -23,7 +23,7 @@ from ..constants import PsConstant
 from ..exceptions import MaterializationError
 from ..functions import NumericLimitsFunctions, CFunction
 from ..literals import PsLiteral
-from ...reduction_op_mapping import reduction_op_to_expr
+from ..reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions import ReductionOp
 from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType
 from ...types.quick import SInt, UInt
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index 3de7cf696..4f8b562fa 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -13,7 +13,7 @@ from ..functions import (
     PsReductionFunction,
 )
 from ..literals import PsLiteral
-from ...reduction_op_mapping import reduction_op_to_expr
+from ..reduction_op_mapping import reduction_op_to_expr
 from ...sympyextensions import ReductionOp
 from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType
 
diff --git a/src/pystencils/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py
similarity index 82%
rename from src/pystencils/reduction_op_mapping.py
rename to src/pystencils/backend/reduction_op_mapping.py
index 06fb8aa3e..876912acd 100644
--- a/src/pystencils/reduction_op_mapping.py
+++ b/src/pystencils/backend/reduction_op_mapping.py
@@ -1,7 +1,7 @@
-from .backend.ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv
-from .backend.exceptions import FreezeError
-from .backend.functions import PsMathFunction, MathFunctions
-from .sympyextensions.reduction import ReductionOp
+from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv
+from .exceptions import FreezeError
+from .functions import PsMathFunction, MathFunctions
+from ..sympyextensions.reduction import ReductionOp
 
 _available_operator_interface: set[ReductionOp] = {
     ReductionOp.Add,
-- 
GitLab


From 935beb559495c5094a42cf0e807ca8ba203beb6d Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 14:41:04 +0200
Subject: [PATCH 151/180] Move kernel AST modifications for reductions to
 distinct function

---
 src/pystencils/codegen/driver.py | 53 +++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index c285dd7bf..3d107eda3 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -19,6 +19,7 @@ from .properties import PsSymbolProperty, FieldBasePtr
 from .parameters import Parameter
 from .functions import Lambda
 from .gpu_indexing import GpuIndexing, GpuLaunchConfiguration
+from ..backend.kernelcreation.context import ReductionInfo
 
 from ..field import Field
 from ..types import PsIntegerType, PsScalarType
@@ -192,28 +193,7 @@ class DefaultKernelCreationDriver:
 
         #   Extensions for reductions
         for symbol, reduction_info in self._ctx.symbols_reduction_info.items():
-            typify = Typifier(self._ctx)
-            symbol_expr = typify(PsSymbolExpr(symbol))
-            ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
-            init_val = typify(reduction_info.init_val)
-
-            ptr_access = PsMemAcc(
-                ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))
-            )
-            write_back_ptr = PsCall(
-                PsReductionFunction(
-                    ReductionFunctions.WriteBackToPtr, reduction_info.op
-                ),
-                [ptr_symbol_expr, symbol_expr],
-            )
-
-            # declare and init local copy with neutral element
-            prepend_ast = [PsDeclaration(symbol_expr, init_val)]
-            # write back result to reduction target variable
-            append_ast = [PsAssignment(ptr_access, write_back_ptr)]
-
-            kernel_ast.statements = prepend_ast + kernel_ast.statements
-            kernel_ast.statements += append_ast
+            self._modify_kernel_ast_for_reductions(symbol, reduction_info, kernel_ast)
 
         #   Target-Specific optimizations
         if self._target.is_cpu():
@@ -315,6 +295,35 @@ class DefaultKernelCreationDriver:
 
         return kernel_body
 
+    def _modify_kernel_ast_for_reductions(self,
+                                          symbol: PsSymbol,
+                                          reduction_info: ReductionInfo,
+                                          kernel_ast: PsBlock):
+        # typify local symbol and write-back pointer expressions and initial value
+        typify = Typifier(self._ctx)
+        symbol_expr = typify(PsSymbolExpr(symbol))
+        ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
+        init_val = typify(reduction_info.init_val)
+
+        ptr_access = PsMemAcc(
+            ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype))
+        )
+        write_back_ptr = PsCall(
+            PsReductionFunction(
+                ReductionFunctions.WriteBackToPtr, reduction_info.op
+            ),
+            [ptr_symbol_expr, symbol_expr],
+        )
+
+        # declare and init local copy with neutral element
+        prepend_ast = [PsDeclaration(symbol_expr, init_val)]
+        # write back result to reduction target variable
+        append_ast = [PsAssignment(ptr_access, write_back_ptr)]
+
+        # modify AST
+        kernel_ast.statements = prepend_ast + kernel_ast.statements
+        kernel_ast.statements += append_ast
+
     def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock:
         canonicalize = CanonicalizeSymbols(self._ctx, True)
         kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
-- 
GitLab


From b4cabdd81a3e94f2881cb26798d6588badb1c377 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 14:57:25 +0200
Subject: [PATCH 152/180] Add consistency check for PsVecHorizontal in typifier

---
 src/pystencils/backend/kernelcreation/typification.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index b457f39a0..9585cb23f 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -605,6 +605,12 @@ class Typifier:
                         f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}"
                     )
 
+                if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type:
+                    raise TypificationError(
+                        f"Scalar type of vector operand {vector_op_tc.target_type} "
+                        f"does not correspond to type of scalar operand {scalar_op_tc.target_type}"
+                    )
+
                 tc.apply_dtype(scalar_op_tc.target_type, expr)
 
             case PsBinOp(op1, op2):
-- 
GitLab


From 3c3283cd4a823b7aeac883a794c2e2fd05e984e2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 15:44:30 +0200
Subject: [PATCH 153/180] Merge reduction user guides into one document

---
 docs/source/backend/reduction_codegen.md | 116 -----------------------
 docs/source/user_manual/reductions.md    |  90 +++++++++++++++++-
 2 files changed, 85 insertions(+), 121 deletions(-)
 delete mode 100644 docs/source/backend/reduction_codegen.md

diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
deleted file mode 100644
index f08fa980a..000000000
--- a/docs/source/backend/reduction_codegen.md
+++ /dev/null
@@ -1,116 +0,0 @@
----
-jupytext:
-  formats: md:myst
-  text_representation:
-    extension: .md
-    format_name: myst
-    format_version: 0.13
-    jupytext_version: 1.16.4
-kernelspec:
-  display_name: Python 3 (ipykernel)
-  language: python
-  name: python3
-mystnb:
-  execution_mode: cache
----
-
-```{code-cell} ipython3
-:tags: [remove-cell, raises-exception]
-
-import pystencils as ps
-import numpy as np
-import cupy as cp
-```
-
-(codegen_reductions)=
-# Code Generation for Reductions
-
-In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain
-optimization strategies have.
-For this, we set up the update rule for a simple dot product kernel:
-
-```{code-cell} ipython3
-r = ps.TypedSymbol("r", "double")
-x, y = ps.fields(f"x, y: double[3D]", layout="fzyx")
-
-assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center())
-```
-
-## CPU Platforms
-
-We first consider a base variant for CPUs without employing any optimizations.
-The generated code for this variant looks as follows:
-
-```{code-cell} ipython3
-cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
-kernel_cpu = ps.create_kernel(assign_dot_prod, cfg)
-
-ps.inspect(kernel_cpu)
-```
-
-We want the reduction kernel to be SIMD vectorized and employ shared-memory parallelism using OpenMP.
-The supported SIMD instruction sets for reductions are:
-* SSE3
-* AVX/AVX2
-* AVX512
-
-Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`.
-**Note that reductions require `assume_inner_stride_one` to be enabled.**
-This is due to the fact that other inner strides would require masked SIMD operations 
-which are not supported yet.
-
-```{code-cell} ipython3
-# configure SIMD vectorization
-cfg = ps.CreateKernelConfig(
-  target=ps.Target.X86_AVX,
-)
-cfg.cpu.vectorize.enable = True
-cfg.cpu.vectorize.assume_inner_stride_one = True
-
-# configure OpenMP parallelization
-cfg.cpu.openmp.enable = True
-cfg.cpu.openmp.num_threads = 8
-
-kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg)
-
-ps.inspect(kernel_cpu_opt)
-```
-
-## GPU Platforms
-
-Reductions are currently only supported for CUDA platforms.
-Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown:
-
-```{code-cell} ipython3
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-
-    kernel_gpu = ps.create_kernel(assign_dot_prod, cfg)
-
-    ps.inspect(kernel_gpu)
-```
-
-As evident from the code, the generated kernel employs atomic operations for updating the pointer 
-holding the reduction result.
-Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance compared to
-only using atomic operations.
-To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by 
-the hardware's warp size.
-**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this
-criterion by enabling `assume_warp_aligned_block_size`.**
-While the default block sizes provided by the code generator already fulfill this criterion,
-we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space.
-
-You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
-
-```{code-cell} ipython3
-    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
-    cfg.gpu.assume_warp_aligned_block_size = True
-
-    kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg)
-    
-    kernel_func = kernel_gpu_opt.compile()
-    kernel_func.launch_config.fit_block_size((32, 1, 1))
-
-    ps.inspect(kernel_gpu_opt)
-```
-
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 0b0c04279..46e935bbb 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -78,11 +78,14 @@ class ReductionOp(Enum):
     Max = "max"
 ```
 
-## Generating Reduction Kernels
+## Generating and Running Reduction Kernels
 
 With the assignments being fully assembled, we can finally invoke the code generator and 
-create the kernel object via the {any}`create_kernel` function. 
-For this example, we assume a kernel configuration where no optimizations are explicitly enabled.
+create the kernel object via the {any}`create_kernel` function.
+
+### CPU Platforms
+
+For this example, we assume a kernel configuration for CPU platforms with no optimizations explicitly enabled.
 
 ```{code-cell} ipython3
 cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
@@ -120,5 +123,82 @@ Since our reduction result is a single scalar value, it is sufficient to set up
     reduction_result[0]
 ```
 
-For GPU platforms, the concepts remain the same but the fields and the write-back pointer now require device memory, 
-i.e. instances of {any}`cupy.ndarray`.
+### GPU Platforms
+
+Please note that **reductions are currently only supported for CUDA platforms**.
+Similar to the CPU section, a base variant for NVIDIA GPUs without 
+explicitly employing any optimizations is shown:
+
+```{code-cell} ipython3
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+    kernel_gpu = ps.create_kernel(assign_sum, cfg)
+
+    ps.inspect(kernel_gpu)
+```
+
+The steps for running the generated code on NVIDIA GPUs are identical but the fields and the write-back pointer 
+now require device memory, i.e. instances of {any}`cupy.ndarray`.
+
+## Optimizations for Reductions
+
+Going beyond the aforementioned basic kernel configurations,
+we now demonstrate optimization strategies for different platforms 
+that can be applied to reduction kernels and show what impact they have.
+
+### CPU Platforms
+
+For CPU platforms, the standard optimizations are SIMD vectorization and shared-memory parallelism using OpenMP.
+The supported SIMD instruction sets for reductions are:
+* SSE3
+* AVX/AVX2
+* AVX512
+
+Below you can see that an AVX vectorization was employed by using the target `Target.X86_AVX`.
+**Note that reductions require `assume_inner_stride_one` to be enabled.**
+This is due to the fact that other inner strides would require masked SIMD operations 
+which are not supported yet.
+
+```{code-cell} ipython3
+# configure SIMD vectorization
+cfg = ps.CreateKernelConfig(
+  target=ps.Target.X86_AVX,
+)
+cfg.cpu.vectorize.enable = True
+cfg.cpu.vectorize.assume_inner_stride_one = True
+
+# configure OpenMP parallelization
+cfg.cpu.openmp.enable = True
+cfg.cpu.openmp.num_threads = 8
+
+kernel_cpu_opt = ps.create_kernel(assign_sum, cfg)
+
+ps.inspect(kernel_cpu_opt)
+```
+
+### GPU Platforms
+
+As evident from the generated kernel for the base variant, atomic operations are employed 
+for updating the pointer holding the reduction result.
+Using the *explicit warp-level instructions* provided by CUDA allows us to achieve higher performance compared to
+only using atomic operations.
+To generate kernels with warp-level reductions, the generator expects that CUDA block sizes are divisible by 
+the hardware's warp size.
+**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this
+criterion by enabling `assume_warp_aligned_block_size`.**
+While the default block sizes provided by the code generator already fulfill this criterion,
+we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space.
+
+You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
+
+```{code-cell} ipython3
+    cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+    cfg.gpu.assume_warp_aligned_block_size = True
+
+    kernel_gpu_opt = ps.create_kernel(assign_sum, cfg)
+    
+    kernel_func = kernel_gpu_opt.compile()
+    kernel_func.launch_config.fit_block_size((32, 1, 1))
+
+    ps.inspect(kernel_gpu_opt)
+```
-- 
GitLab


From b0904d93d48f0a165e4b1561e3b267db09986221 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 16:45:26 +0200
Subject: [PATCH 154/180] Fix typecheck

---
 src/pystencils/backend/reduction_op_mapping.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py
index 876912acd..832f5d0bf 100644
--- a/src/pystencils/backend/reduction_op_mapping.py
+++ b/src/pystencils/backend/reduction_op_mapping.py
@@ -15,18 +15,17 @@ def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     if op in _available_operator_interface:
         match op:
             case ReductionOp.Add:
-                operator = PsAdd
+                return PsAdd(op1, op2)
             case ReductionOp.Sub:
-                operator = PsSub
+                return PsSub(op1, op2)
             case ReductionOp.Mul:
-                operator = PsMul
+                return PsMul(op1, op2)
             case ReductionOp.Div:
-                operator = PsDiv
+                return PsDiv(op1, op2)
             case _:
                 raise FreezeError(
                     f"Found unsupported operation type for reduction assignments: {op}."
                 )
-        return operator(op1, op2)
     else:
         match op:
             case ReductionOp.Min:
-- 
GitLab


From 603e6a3fce53a0bbd4d2b21faa12c34da3936b1b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 23 Apr 2025 16:46:27 +0200
Subject: [PATCH 155/180] Fix docs

---
 docs/source/backend/index.rst         | 1 -
 docs/source/user_manual/reductions.md | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index b9b400544..0d384c55b 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -16,7 +16,6 @@ who wish to customize or extend the behaviour of the code generator in their app
     iteration_space
     translation
     platforms
-    reduction_codegen
     transformations
     gpu_codegen
     errors
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 46e935bbb..7c7904dfc 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -96,10 +96,7 @@ ps.inspect(kernel)
 
 :::{note}
 The generated reduction kernels may vary vastly for different platforms and optimizations.
-For the sake of compactness, the impact of different backend or optimization choices is left out.
-
-A detailed description of configuration choices and their impact on the generated code can be found in
-{ref}`codegen_reductions`.
+You can find a detailed description of configuration choices and their impact on the generated code below.
 :::
 
 The kernel can be compiled and run immediately.
-- 
GitLab


From 6046db6e13ddd875e7c68edd9f99fb7a7101f47f Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 17:54:31 +0200
Subject: [PATCH 156/180] Move symbol handling for reductions to context and
 add more checks

---
 .../backend/kernelcreation/context.py         | 51 ++++++++++-----
 .../backend/kernelcreation/freeze.py          | 64 +++++++++++--------
 .../backend/transformations/add_pragmas.py    |  8 +--
 .../transformations/loop_vectorizer.py        | 11 ++--
 src/pystencils/codegen/driver.py              |  9 ++-
 5 files changed, 89 insertions(+), 54 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 358b5ff6c..58a4bd7d1 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -54,7 +54,8 @@ class ReductionInfo:
 
     op: ReductionOp
     init_val: PsExpression
-    ptr_symbol: PsSymbol
+    local_symbol: PsSymbol
+    writeback_ptr_symbol: PsSymbol
 
 
 class KernelCreationContext:
@@ -88,7 +89,7 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
-        self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict()
+        self._reduction_data: dict[str, ReductionInfo] = dict()
 
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
@@ -193,19 +194,39 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
-    def add_symbol_reduction_info(
-        self, local_symb: PsSymbol, reduction_info: ReductionInfo
+    def add_reduction_info(
+        self,
+        lhs_name: str,
+        lhs_dtype: PsType,
+        reduction_op: ReductionOp,
+        init_value: PsExpression,
     ):
-        """Adds entry for a symbol and its reduction info to its corresponding lookup table.
+        """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name."""
 
-        The symbol ``symbol`` shall not exist in the symbol table already.
-        """
-        if local_symb in self._symbols_reduction_info:
-            raise PsInternalCompilerError(
-                f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table"
-            )
+        # replace datatype of lhs symbol with pointer datatype for write-back mechanism
+        symb = self.get_symbol(lhs_name, lhs_dtype)
+        pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype))
+        self.replace_symbol(symb, pointer_symb)
+
+        # create kernel-local copy of lhs symbol
+        local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype)
+        self.add_symbol(local_symb)
 
-        self._symbols_reduction_info[local_symb] = reduction_info
+        # create reduction info and add to set
+        reduction_info = ReductionInfo(
+            reduction_op, init_value, local_symb, pointer_symb
+        )
+        self._reduction_data[lhs_name] = reduction_info
+
+        return reduction_info
+
+    def find_reduction_info(self, name: str) -> ReductionInfo | None:
+        """Find a ReductionInfo with the given name in the lookup table, if it exists.
+
+        Returns:
+            The ReductionInfo with the given name, or `None` if it does not exist.
+        """
+        return self._reduction_data.get(name, None)
 
     def duplicate_symbol(
         self, symb: PsSymbol, new_dtype: PsType | None = None
@@ -243,9 +264,9 @@ class KernelCreationContext:
         return self._symbols.values()
 
     @property
-    def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]:
-        """Return a dictionary holding kernel-local reduction symbols and their reduction properties."""
-        return self._symbols_reduction_info
+    def reduction_data(self) -> dict[str, ReductionInfo]:
+        """Return a dictionary holding kernel-local reduction information for given symbol names."""
+        return self._reduction_data
 
     #   Fields and Arrays
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index c5ff43fb9..2f00df4e8 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -6,7 +6,6 @@ import sympy as sp
 import sympy.logic.boolalg
 from sympy.codegen.ast import AssignmentBase, AugmentedAssignment
 
-from ..memory import PsSymbol
 from ...assignment import Assignment
 from ...simp import AssignmentCollection
 from ...sympyextensions import (
@@ -19,7 +18,7 @@ from ...sympyextensions.pointers import AddressOf, mem_acc
 from ...sympyextensions.reduction import ReductionAssignment, ReductionOp
 from ...field import Field, FieldType
 
-from .context import KernelCreationContext, ReductionInfo
+from .context import KernelCreationContext
 
 from ..ast.structural import (
     PsAstNode,
@@ -62,7 +61,7 @@ from ..ast.expressions import (
 from ..ast.vector import PsVecMemAcc
 
 from ..constants import PsConstant
-from ...types import PsNumericType, PsStructType, PsType, PsPointerType
+from ...types import PsNumericType, PsStructType, PsType
 from ..exceptions import PsInputError
 from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
 from ..exceptions import FreezeError
@@ -190,32 +189,41 @@ class FreezeExpressions:
     def map_ReductionAssignment(self, expr: ReductionAssignment):
         assert isinstance(expr.lhs, TypedSymbol)
 
+        # make sure that either:
+        # 1) lhs symbol never occurred
+        # 2) that it is at least known as lhs of an existing reduction operation
+        if self._ctx.find_symbol(expr.lhs.name):
+            # make sure that reduction operations are not mixed within a kernel
+            if info := self._ctx.find_reduction_info(expr.lhs.name):
+                if info.op is not expr.reduction_op:
+                    raise FreezeError(
+                        f"Different reduction operation {info.op} already exists "
+                        f"for {expr.lhs} with target reduction op {expr.reduction_op}."
+                    )
+                else:
+                    raise FreezeError(
+                        f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table."
+                        f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's."
+                    )
+
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
 
         assert isinstance(rhs, PsExpression)
         assert isinstance(lhs, PsSymbolExpr)
 
-        op = expr.reduction_op
-        orig_lhs_symb = lhs.symbol
-        dtype = lhs.dtype
-
-        assert isinstance(dtype, PsNumericType), \
-            "Reduction assignments require type information of the lhs symbol."
-
-        # replace original symbol with pointer-based type used for export
-        orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype))
-
-        # create kernel-local copy of lhs symbol to work with
-        new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype)
-        new_lhs = PsSymbolExpr(new_lhs_symb)
+        reduction_op = expr.reduction_op
+        lhs_symbol = lhs.symbol
+        lhs_dtype = lhs_symbol.dtype
+        lhs_name = lhs_symbol.name
 
-        # get new rhs from augmented assignment
-        new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs)
+        assert isinstance(
+            lhs_dtype, PsNumericType
+        ), "Reduction assignments require type information of the lhs symbol."
 
         # match for reduction operation and set neutral init_val
         init_val: PsExpression
-        match op:
+        match reduction_op:
             case ReductionOp.Add:
                 init_val = PsConstantExpr(PsConstant(0))
             case ReductionOp.Sub:
@@ -227,14 +235,20 @@ class FreezeExpressions:
             case ReductionOp.Max:
                 init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
             case _:
-                raise FreezeError(f"Unsupported kind of reduction assignment: {op}.")
+                raise FreezeError(
+                    f"Unsupported kind of reduction assignment: {reduction_op}."
+                )
 
-        reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr)
+        # get reduction info from context
+        reduction_info = self._ctx.add_reduction_info(
+            lhs_name, lhs_dtype, reduction_op, init_val
+        )
+
+        # create new lhs from newly created local lhs symbol
+        new_lhs = PsSymbolExpr(reduction_info.local_symbol)
 
-        # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info
-        self._ctx.add_symbol(new_lhs_symb)
-        self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info)
-        self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr)
+        # get new rhs from augmented assignment
+        new_rhs: PsExpression = reduction_op_to_expr(reduction_op, new_lhs, rhs)
 
         return PsAssignment(new_lhs, new_rhs)
 
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index fa466e495..1d1cb6a8d 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -123,11 +123,11 @@ class AddOpenMP:
         if num_threads is not None:
             pragma_text += f" num_threads({str(num_threads)})"
 
-        if bool(ctx.symbols_reduction_info):
-            for symbol, reduction_info in ctx.symbols_reduction_info.items():
-                if isinstance(symbol.dtype, PsScalarType):
+        if bool(ctx.reduction_data):
+            for _, reduction_info in ctx.reduction_data.items():
+                if isinstance(reduction_info.local_symbol.dtype, PsScalarType):
                     pragma_text += (
-                        f" reduction({reduction_info.op.value}: {symbol.name})"
+                        f" reduction({reduction_info.op.value}: {reduction_info.local_symbol.name})"
                     )
                 else:
                     NotImplementedError(
diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py
index 09b0aa5dd..8061240b7 100644
--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -143,16 +143,17 @@ class LoopVectorizer:
         #   Prepare reductions
         simd_init_local_reduction_vars: list[PsStructuralNode] = []
         simd_writeback_local_reduction_vars: list[PsStructuralNode] = []
-        for symb, reduction_info in self._ctx.symbols_reduction_info.items():
+        for _, reduction_info in self._ctx.reduction_data.items():
             # Vectorize symbol for local copy
-            vector_symb = vc.vectorize_symbol(symb)
+            local_symbol = reduction_info.local_symbol
+            vector_symb = vc.vectorize_symbol(local_symbol)
 
             # Declare and init vector
             simd_init_local_reduction_vars += [
                 self._type_fold(
                     PsDeclaration(
                         PsSymbolExpr(vector_symb),
-                        PsVecBroadcast(self._lanes, PsSymbolExpr(symb)),
+                        PsVecBroadcast(self._lanes, PsSymbolExpr(local_symbol)),
                     )
                 )
             ]
@@ -160,9 +161,9 @@ class LoopVectorizer:
             # Write back vectorization result
             simd_writeback_local_reduction_vars += [
                 PsAssignment(
-                    PsSymbolExpr(symb),
+                    PsSymbolExpr(local_symbol),
                     PsVecHorizontal(
-                        PsSymbolExpr(symb),
+                        PsSymbolExpr(local_symbol),
                         PsSymbolExpr(vector_symb),
                         reduction_info.op,
                     ),
diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 3d107eda3..74a07b902 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -192,8 +192,8 @@ class DefaultKernelCreationDriver:
             self._intermediates.constants_eliminated = kernel_ast.clone()
 
         #   Extensions for reductions
-        for symbol, reduction_info in self._ctx.symbols_reduction_info.items():
-            self._modify_kernel_ast_for_reductions(symbol, reduction_info, kernel_ast)
+        for _, reduction_info in self._ctx.reduction_data.items():
+            self._modify_kernel_ast_for_reductions(reduction_info, kernel_ast)
 
         #   Target-Specific optimizations
         if self._target.is_cpu():
@@ -296,13 +296,12 @@ class DefaultKernelCreationDriver:
         return kernel_body
 
     def _modify_kernel_ast_for_reductions(self,
-                                          symbol: PsSymbol,
                                           reduction_info: ReductionInfo,
                                           kernel_ast: PsBlock):
         # typify local symbol and write-back pointer expressions and initial value
         typify = Typifier(self._ctx)
-        symbol_expr = typify(PsSymbolExpr(symbol))
-        ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol))
+        symbol_expr = typify(PsSymbolExpr(reduction_info.local_symbol))
+        ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.writeback_ptr_symbol))
         init_val = typify(reduction_info.init_val)
 
         ptr_access = PsMemAcc(
-- 
GitLab


From 081d11ad152bfa6e1f809af7d20789f00d39e5a9 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 19:10:21 +0200
Subject: [PATCH 157/180] Fix indent for error handling in freeze

---
 src/pystencils/backend/kernelcreation/freeze.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 2f00df4e8..045aca1d1 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -200,11 +200,11 @@ class FreezeExpressions:
                         f"Different reduction operation {info.op} already exists "
                         f"for {expr.lhs} with target reduction op {expr.reduction_op}."
                     )
-                else:
-                    raise FreezeError(
-                        f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table."
-                        f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's."
-                    )
+            else:
+                raise FreezeError(
+                    f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table."
+                    f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's."
+                )
 
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
-- 
GitLab


From f0aba2e948fef8f535644e80b0f7d35e0e5f60e0 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 20:06:25 +0200
Subject: [PATCH 158/180] Document attributes of ReductionInfo

---
 src/pystencils/backend/kernelcreation/context.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 58a4bd7d1..63bfc2f7b 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -50,7 +50,16 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 @dataclass(frozen=True)
 class ReductionInfo:
     """Information about a reduction operation, its neutral element in form of an initial value
-    and the pointer used by the kernel as write-back argument."""
+    and the pointer used by the kernel as write-back argument.
+
+    Attributes:
+    ===========
+
+    op : Reduction operation being performed
+    init_val : Initial value used to initialize local symbol
+    local_symbol : Kernel-local symbol used to accumulate intermediate reduction result
+    writeback_ptr_symbol : Symbol that is used to export the final reduction result
+    """
 
     op: ReductionOp
     init_val: PsExpression
-- 
GitLab


From eb6e8c0f8c98d00cb56e6dfc7b7add5036e766ae Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 20:35:16 +0200
Subject: [PATCH 159/180] Check if reduction symbol is (illegally) accessed
 before/after reduction assignment

---
 .../backend/kernelcreation/freeze.py          | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 045aca1d1..e0ed0f1f7 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -189,22 +189,12 @@ class FreezeExpressions:
     def map_ReductionAssignment(self, expr: ReductionAssignment):
         assert isinstance(expr.lhs, TypedSymbol)
 
-        # make sure that either:
-        # 1) lhs symbol never occurred
-        # 2) that it is at least known as lhs of an existing reduction operation
+        # make sure that lhs symbol never occurred before ReductionAssignment
         if self._ctx.find_symbol(expr.lhs.name):
-            # make sure that reduction operations are not mixed within a kernel
-            if info := self._ctx.find_reduction_info(expr.lhs.name):
-                if info.op is not expr.reduction_op:
-                    raise FreezeError(
-                        f"Different reduction operation {info.op} already exists "
-                        f"for {expr.lhs} with target reduction op {expr.reduction_op}."
-                    )
-            else:
-                raise FreezeError(
-                    f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table."
-                    f"Make sure that it is exclusively used within the kernel to conduct ReductionAssignment's."
-                )
+            raise FreezeError(
+                f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table. "
+                f"Make sure that it is only used once in a kernel's ReductionAssignment."
+            )
 
         lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
@@ -340,6 +330,16 @@ class FreezeExpressions:
 
     def map_TypedSymbol(self, expr: TypedSymbol):
         dtype = self._ctx.resolve_dynamic_type(expr.dtype)
+
+        # check if symbol is referenced after freezing a ReductionAssignment
+        if self._ctx.find_reduction_info(expr.name):
+            # check if types do not align since a ReductionAssignment modifies
+            # the symbol's type to PsPointerType in the context's symbol table
+            if (symbol := self._ctx.find_symbol(expr.name)) and symbol.dtype != dtype:
+                raise FreezeError(
+                    f"Illegal access to reduction symbol {symbol.name} after freezing a kernel's ReductionAssignment. "
+                )
+
         symb = self._ctx.get_symbol(expr.name, dtype)
         return PsSymbolExpr(symb)
 
-- 
GitLab


From f678a2fa2c38e2fa202a2c2a12472cdcc229b3d2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 20:36:36 +0200
Subject: [PATCH 160/180] Add unit test for checking border cases of freezing
 illegal usages of ReductionAssignments

---
 tests/nbackend/kernelcreation/test_freeze.py | 31 ++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py
index f6c8f85b2..7082f28ad 100644
--- a/tests/nbackend/kernelcreation/test_freeze.py
+++ b/tests/nbackend/kernelcreation/test_freeze.py
@@ -66,6 +66,7 @@ from pystencils.sympyextensions.integer_functions import (
     ceil_to_multiple,
     div_ceil,
 )
+from pystencils.sympyextensions.reduction import AddReductionAssignment
 
 
 def test_freeze_simple():
@@ -494,6 +495,36 @@ def test_invalid_arrays():
         _ = freeze(symb_arr)
 
 
+def test_invalid_reduction_assignments():
+    x = fields(f"x: float64[1d]")
+    w = TypedSymbol("w", "float64")
+
+    ctx = KernelCreationContext()
+    freeze = FreezeExpressions(ctx)
+
+    one = PsExpression.make(PsConstant(1, ctx.index_dtype))
+    counter = ctx.get_symbol("ctr", ctx.index_dtype)
+    ispace = FullIterationSpace(
+        ctx, [FullIterationSpace.Dimension(one, one, one, counter)]
+    )
+    ctx.set_iteration_space(ispace)
+
+    invalid_assignment = Assignment(w, -1 * x.center())
+    reduction_assignment = AddReductionAssignment(w, 3 * x.center())
+
+    # reduction symbol is used before ReductionAssignment
+    with pytest.raises(FreezeError):
+        _ = [freeze(asm) for asm in [invalid_assignment, reduction_assignment]]
+
+    # reduction symbol is used after ReductionAssignment
+    with pytest.raises(FreezeError):
+        _ = [freeze(asm) for asm in [reduction_assignment, invalid_assignment]]
+
+    # duplicate ReductionAssignment
+    with pytest.raises(FreezeError):
+        _ = [freeze(asm) for asm in [reduction_assignment, reduction_assignment]]
+
+
 def test_memory_access():
     ctx = KernelCreationContext()
     freeze = FreezeExpressions(ctx)
-- 
GitLab


From 94dcf3c5362238f1203b88f8a6a5873431749dc5 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 20:47:07 +0200
Subject: [PATCH 161/180] Add unit test for freezing of ReductionAssignments

---
 tests/nbackend/kernelcreation/test_freeze.py | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py
index 7082f28ad..a39a0a994 100644
--- a/tests/nbackend/kernelcreation/test_freeze.py
+++ b/tests/nbackend/kernelcreation/test_freeze.py
@@ -44,6 +44,7 @@ from pystencils.backend.ast.expressions import (
     PsArrayInitList,
     PsSubscript,
     PsMemAcc,
+    PsSymbolExpr,
 )
 from pystencils.backend.constants import PsConstant
 from pystencils.backend.functions import PsMathFunction, MathFunctions
@@ -495,6 +496,31 @@ def test_invalid_arrays():
         _ = freeze(symb_arr)
 
 
+def test_reduction_assignments():
+    x = fields(f"x: float64[1d]")
+    w = TypedSymbol("w", "float64")
+
+    ctx = KernelCreationContext()
+    freeze = FreezeExpressions(ctx)
+
+    one = PsExpression.make(PsConstant(1, ctx.index_dtype))
+    counter = ctx.get_symbol("ctr", ctx.index_dtype)
+    ispace = FullIterationSpace(
+        ctx, [FullIterationSpace.Dimension(one, one, one, counter)]
+    )
+    ctx.set_iteration_space(ispace)
+
+    expr = freeze(AddReductionAssignment(w, 3 * x.center()))
+
+    info = ctx.find_reduction_info(w.name)
+
+    assert isinstance(expr, PsAssignment)
+    assert isinstance(expr.lhs, PsSymbolExpr)
+
+    assert expr.lhs.symbol == info.local_symbol
+    assert expr.lhs.dtype == w.dtype
+
+
 def test_invalid_reduction_assignments():
     x = fields(f"x: float64[1d]")
     w = TypedSymbol("w", "float64")
-- 
GitLab


From 3010daed816e9fe4ed9b685b1f666c75736d8222 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Thu, 24 Apr 2025 21:08:49 +0200
Subject: [PATCH 162/180] Add typification test for PsVecHorizontal

---
 .../kernelcreation/test_typification.py       | 47 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/tests/nbackend/kernelcreation/test_typification.py b/tests/nbackend/kernelcreation/test_typification.py
index 3defe4ad5..31df7090d 100644
--- a/tests/nbackend/kernelcreation/test_typification.py
+++ b/tests/nbackend/kernelcreation/test_typification.py
@@ -5,6 +5,7 @@ import numpy as np
 from typing import cast
 
 from pystencils import Assignment, TypedSymbol, Field, FieldType, AddAugmentedAssignment
+from pystencils.sympyextensions import ReductionOp
 from pystencils.sympyextensions.pointers import mem_acc
 
 from pystencils.backend.ast.structural import (
@@ -34,7 +35,7 @@ from pystencils.backend.ast.expressions import (
     PsTernary,
     PsMemAcc
 )
-from pystencils.backend.ast.vector import PsVecBroadcast
+from pystencils.backend.ast.vector import PsVecBroadcast, PsVecHorizontal
 from pystencils.backend.constants import PsConstant
 from pystencils.backend.functions import CFunction
 from pystencils.types import constify, create_type, create_numeric_type, PsVectorType
@@ -649,6 +650,50 @@ def test_typify_bool_vectors():
     assert result.get_dtype() == PsVectorType(Bool(), 4)
 
 
+def test_typify_horizontal_vector_reductions():
+    ctx = KernelCreationContext()
+    typify = Typifier(ctx)
+
+    reduction_op = ReductionOp.Add
+    stype = Fp(32)
+    vtype = PsVectorType(stype, 4)
+
+    def create_symb_expr(name, tpe):
+        return PsExpression.make(ctx.get_symbol(name, tpe))
+
+    # create valid horizontal and check if expression type is scalar
+    result = typify(
+        PsVecHorizontal(
+            create_symb_expr("s1", stype), create_symb_expr("v1", vtype), ReductionOp.Add
+        )
+    )
+    assert result.get_dtype() == stype
+
+    # create invalid horizontal by using scalar type for expected vector type
+    with pytest.raises(TypificationError):
+        _ = typify(
+            PsVecHorizontal(
+                create_symb_expr("s2", stype), create_symb_expr("v2", stype), reduction_op
+            )
+        )
+
+    # create invalid horizontal by using vector type for expected scalar type
+    with pytest.raises(TypificationError):
+        _ = typify(
+            PsVecHorizontal(
+                create_symb_expr("s3", vtype), create_symb_expr("v3", vtype), reduction_op
+            )
+        )
+
+    # create invalid horizontal where base type of vector does not match with scalar type
+    with pytest.raises(TypificationError):
+        _ = typify(
+            PsVecHorizontal(
+                create_symb_expr("s4", Int(32)), create_symb_expr("v4", vtype), reduction_op
+            )
+        )
+
+
 def test_inference_fails():
     ctx = KernelCreationContext()
     typify = Typifier(ctx)
-- 
GitLab


From da00b78ff50a2b5e61ae0d9ad53f56d3a6bc4c4d Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 25 Apr 2025 12:58:57 +0200
Subject: [PATCH 163/180] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Frederik Hennig <frederik.hennig@fau.de>
---
 src/pystencils/backend/ast/vector.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 55db67e7c..d7ae8d6a9 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -41,10 +41,9 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
 
 
 class PsVecHorizontal(PsBinOp, PsVectorOp):
-    """Represents a binary operation between a scalar and a vector operand.
-    With the binary operation not being vectorized, a horizontal reduction
-    along the lanes of the vector operand is required to extract a scalar value.
-    The result type will be equal to the scalar operand.
+    """Perform a horizontal reduction across a vector onto a scalar base value.
+
+    **Example:** `vec_horizontal_add(s, v)` will compute `s + v[0] + v[1] + ... + v[n-1]`.
 
     Args:
         scalar_operand: Scalar operand
-- 
GitLab


From c0df001f196655dd5b3ada6f1fa3909b90584abd Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Fri, 25 Apr 2025 12:59:07 +0200
Subject: [PATCH 164/180] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Frederik Hennig <frederik.hennig@fau.de>
---
 .../backend/reduction_op_mapping.py           | 41 ++++++++-----------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py
index 832f5d0bf..59273efab 100644
--- a/src/pystencils/backend/reduction_op_mapping.py
+++ b/src/pystencils/backend/reduction_op_mapping.py
@@ -12,27 +12,20 @@ _available_operator_interface: set[ReductionOp] = {
 
 
 def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
-    if op in _available_operator_interface:
-        match op:
-            case ReductionOp.Add:
-                return PsAdd(op1, op2)
-            case ReductionOp.Sub:
-                return PsSub(op1, op2)
-            case ReductionOp.Mul:
-                return PsMul(op1, op2)
-            case ReductionOp.Div:
-                return PsDiv(op1, op2)
-            case _:
-                raise FreezeError(
-                    f"Found unsupported operation type for reduction assignments: {op}."
-                )
-    else:
-        match op:
-            case ReductionOp.Min:
-                return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
-            case ReductionOp.Max:
-                return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
-            case _:
-                raise FreezeError(
-                    f"Found unsupported operation type for reduction assignments: {op}."
-                )
+    match op:
+        case ReductionOp.Add:
+            return PsAdd(op1, op2)
+        case ReductionOp.Sub:
+            return PsSub(op1, op2)
+        case ReductionOp.Mul:
+            return PsMul(op1, op2)
+        case ReductionOp.Div:
+            return PsDiv(op1, op2)
+        case ReductionOp.Min:
+            return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
+        case ReductionOp.Max:
+            return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
+        case _:
+            raise FreezeError(
+                f"Found unsupported operation type for reduction assignments: {op}."
+            )
-- 
GitLab


From 705ac53161f9a2fe49a2898de36f118a79e2d6a2 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:19:00 +0200
Subject: [PATCH 165/180] Minor doc change

---
 docs/source/user_manual/reductions.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
index 7c7904dfc..5b45a921c 100644
--- a/docs/source/user_manual/reductions.md
+++ b/docs/source/user_manual/reductions.md
@@ -105,8 +105,9 @@ To execute the kernel on CPUs, not only a {any}`numpy.ndarray` has to be passed
 but also one for exporting reduction results. 
 The export mechanism can be seen in the previously generated code snippet. 
 Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`).
-This pointer not only allows providing initial values for the reduction but is also used for writing back the
-reduction result. 
+This pointer is used for exporting the reduction result back from the kernel.
+Please note that the **values passed via pointer will not be overwritten** 
+but will be incorporated in the reduction computation.
 Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a singular value.
 
 ```{code-cell} ipython3
-- 
GitLab


From 0c8654e3251dea9c4d0ca9e3fa14f54dc970564b Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:36:58 +0200
Subject: [PATCH 166/180] Extend KernelAnalysis to check for invalid references
 to reduction symbols

---
 .../backend/kernelcreation/analysis.py        | 17 +++++++++
 .../nbackend/kernelcreation/test_analysis.py  | 38 +++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 tests/nbackend/kernelcreation/test_analysis.py

diff --git a/src/pystencils/backend/kernelcreation/analysis.py b/src/pystencils/backend/kernelcreation/analysis.py
index 1365e1ef3..e5f8b921e 100644
--- a/src/pystencils/backend/kernelcreation/analysis.py
+++ b/src/pystencils/backend/kernelcreation/analysis.py
@@ -13,6 +13,8 @@ from ...simp import AssignmentCollection
 from sympy.codegen.ast import AssignmentBase
 
 from ..exceptions import PsInternalCompilerError, KernelConstraintsError
+from ...sympyextensions.reduction import ReductionAssignment
+from ...sympyextensions.typed_sympy import TypedSymbol
 
 
 class KernelAnalysis:
@@ -54,6 +56,8 @@ class KernelAnalysis:
         self._check_access_independence = check_access_independence
         self._check_double_writes = check_double_writes
 
+        self._reduction_symbols: set[TypedSymbol] = set()
+
         #   Map pairs of fields and indices to offsets
         self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict(
             set
@@ -88,6 +92,14 @@ class KernelAnalysis:
                 for asm in asms:
                     self._visit(asm)
 
+            case ReductionAssignment():
+                assert isinstance(obj.lhs, TypedSymbol)
+
+                self._reduction_symbols.add(obj.lhs)
+
+                self._handle_rhs(obj.rhs)
+                self._handle_lhs(obj.lhs)
+
             case AssignmentBase():
                 self._handle_rhs(obj.rhs)
                 self._handle_lhs(obj.lhs)
@@ -152,6 +164,11 @@ class KernelAnalysis:
                                     f"{field} is read at {offsets} and written at {write_offset}"
                                 )
                 case sp.Symbol():
+                    if expr in self._reduction_symbols:
+                        raise KernelConstraintsError(
+                            f"Illegal access to reduction symbol {expr.name} outside of ReductionAssignment. "
+                        )
+
                     self._scopes.access_symbol(expr)
 
             for arg in expr.args:
diff --git a/tests/nbackend/kernelcreation/test_analysis.py b/tests/nbackend/kernelcreation/test_analysis.py
new file mode 100644
index 000000000..d68c0a5f3
--- /dev/null
+++ b/tests/nbackend/kernelcreation/test_analysis.py
@@ -0,0 +1,38 @@
+import pytest
+
+from pystencils import fields, TypedSymbol, AddReductionAssignment, Assignment, KernelConstraintsError
+from pystencils.backend.kernelcreation import KernelCreationContext, KernelAnalysis
+from pystencils.sympyextensions import mem_acc
+from pystencils.types.quick import Ptr, Fp
+
+
+def test_invalid_reduction_symbol_reassign():
+    dtype = Fp(64)
+    ctx = KernelCreationContext(default_dtype=dtype)
+    analysis = KernelAnalysis(ctx)
+
+    x = fields(f"x: [1d]")
+    w = TypedSymbol("w", dtype)
+
+    # illegal reassign to already locally defined symbol (here: reduction symbol)
+    with pytest.raises(KernelConstraintsError):
+        analysis([
+            AddReductionAssignment(w, 3 * x.center()),
+            Assignment(w, 0)
+        ])
+
+def test_invalid_reduction_symbol_reference():
+    dtype = Fp(64)
+    ctx = KernelCreationContext(default_dtype=dtype)
+    analysis = KernelAnalysis(ctx)
+
+    x = fields(f"x: [1d]")
+    v = TypedSymbol("v", dtype)
+    w = TypedSymbol("w", dtype)
+
+    # do not allow reduction symbol to be referenced on rhs of other assignments
+    with pytest.raises(KernelConstraintsError):
+        analysis([
+            AddReductionAssignment(w, 3 * x.center()),
+            Assignment(v, w)
+        ])
\ No newline at end of file
-- 
GitLab


From 325ca38652751eccdd33fe2cc9dc55735c6dfc0c Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:44:08 +0200
Subject: [PATCH 167/180] Move checks and init value determination for
 ReductionAssignments to add_reduction_info

---
 .../backend/kernelcreation/context.py         | 31 ++++++++++++--
 .../backend/kernelcreation/freeze.py          | 41 +------------------
 2 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 63bfc2f7b..1d7e75db1 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -6,7 +6,8 @@ from itertools import chain, count
 from collections import namedtuple, defaultdict
 import re
 
-from ..ast.expressions import PsExpression
+from ..ast.expressions import PsExpression, PsConstantExpr, PsCall
+from ..functions import NumericLimitsFunctions, PsMathFunction
 from ...defaults import DEFAULTS
 from ...field import Field, FieldType
 from ...sympyextensions import ReductionOp
@@ -208,10 +209,16 @@ class KernelCreationContext:
         lhs_name: str,
         lhs_dtype: PsType,
         reduction_op: ReductionOp,
-        init_value: PsExpression,
     ):
         """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name."""
 
+        # make sure that lhs symbol never occurred before ReductionAssignment
+        if self.find_symbol(lhs_name):
+            raise KernelConstraintsError(
+                f"Left-hand side {lhs_name} of ReductionAssignment already exists in symbol table. "
+                f"Make sure that it is only used once in a kernel's ReductionAssignment."
+            )
+
         # replace datatype of lhs symbol with pointer datatype for write-back mechanism
         symb = self.get_symbol(lhs_name, lhs_dtype)
         pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype))
@@ -221,9 +228,27 @@ class KernelCreationContext:
         local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype)
         self.add_symbol(local_symb)
 
+        # match for reduction operation and set neutral init_val
+        init_val: PsExpression
+        match reduction_op:
+            case ReductionOp.Add:
+                init_val = PsConstantExpr(PsConstant(0))
+            case ReductionOp.Sub:
+                init_val = PsConstantExpr(PsConstant(0))
+            case ReductionOp.Mul:
+                init_val = PsConstantExpr(PsConstant(1))
+            case ReductionOp.Min:
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
+            case ReductionOp.Max:
+                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
+            case _:
+                raise PsInternalCompilerError(
+                    f"Unsupported kind of reduction assignment: {reduction_op}."
+                )
+
         # create reduction info and add to set
         reduction_info = ReductionInfo(
-            reduction_op, init_value, local_symb, pointer_symb
+            reduction_op, init_val, local_symb, pointer_symb
         )
         self._reduction_data[lhs_name] = reduction_info
 
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index e0ed0f1f7..4c7b8fb23 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -189,21 +189,12 @@ class FreezeExpressions:
     def map_ReductionAssignment(self, expr: ReductionAssignment):
         assert isinstance(expr.lhs, TypedSymbol)
 
-        # make sure that lhs symbol never occurred before ReductionAssignment
-        if self._ctx.find_symbol(expr.lhs.name):
-            raise FreezeError(
-                f"Left-hand side {expr.lhs} of ReductionAssignment already exists in symbol table. "
-                f"Make sure that it is only used once in a kernel's ReductionAssignment."
-            )
-
-        lhs = self.visit(expr.lhs)
         rhs = self.visit(expr.rhs)
 
         assert isinstance(rhs, PsExpression)
-        assert isinstance(lhs, PsSymbolExpr)
 
         reduction_op = expr.reduction_op
-        lhs_symbol = lhs.symbol
+        lhs_symbol = expr.lhs
         lhs_dtype = lhs_symbol.dtype
         lhs_name = lhs_symbol.name
 
@@ -211,27 +202,9 @@ class FreezeExpressions:
             lhs_dtype, PsNumericType
         ), "Reduction assignments require type information of the lhs symbol."
 
-        # match for reduction operation and set neutral init_val
-        init_val: PsExpression
-        match reduction_op:
-            case ReductionOp.Add:
-                init_val = PsConstantExpr(PsConstant(0))
-            case ReductionOp.Sub:
-                init_val = PsConstantExpr(PsConstant(0))
-            case ReductionOp.Mul:
-                init_val = PsConstantExpr(PsConstant(1))
-            case ReductionOp.Min:
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), [])
-            case ReductionOp.Max:
-                init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), [])
-            case _:
-                raise FreezeError(
-                    f"Unsupported kind of reduction assignment: {reduction_op}."
-                )
-
         # get reduction info from context
         reduction_info = self._ctx.add_reduction_info(
-            lhs_name, lhs_dtype, reduction_op, init_val
+            lhs_name, lhs_dtype, reduction_op
         )
 
         # create new lhs from newly created local lhs symbol
@@ -330,16 +303,6 @@ class FreezeExpressions:
 
     def map_TypedSymbol(self, expr: TypedSymbol):
         dtype = self._ctx.resolve_dynamic_type(expr.dtype)
-
-        # check if symbol is referenced after freezing a ReductionAssignment
-        if self._ctx.find_reduction_info(expr.name):
-            # check if types do not align since a ReductionAssignment modifies
-            # the symbol's type to PsPointerType in the context's symbol table
-            if (symbol := self._ctx.find_symbol(expr.name)) and symbol.dtype != dtype:
-                raise FreezeError(
-                    f"Illegal access to reduction symbol {symbol.name} after freezing a kernel's ReductionAssignment. "
-                )
-
         symb = self._ctx.get_symbol(expr.name, dtype)
         return PsSymbolExpr(symb)
 
-- 
GitLab


From 1580a2b06b0aa060def7428cf5ac3f7938b81f92 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:45:11 +0200
Subject: [PATCH 168/180] Adapt test_invalid_reduction_assignments to create
 new contexts for each subtest

---
 tests/nbackend/kernelcreation/test_freeze.py | 46 +++++++++++---------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py
index a39a0a994..987b68043 100644
--- a/tests/nbackend/kernelcreation/test_freeze.py
+++ b/tests/nbackend/kernelcreation/test_freeze.py
@@ -8,6 +8,7 @@ from pystencils import (
     create_numeric_type,
     TypedSymbol,
     DynamicType,
+    KernelConstraintsError,
 )
 from pystencils.sympyextensions import tcast
 from pystencils.sympyextensions.pointers import mem_acc
@@ -68,6 +69,7 @@ from pystencils.sympyextensions.integer_functions import (
     div_ceil,
 )
 from pystencils.sympyextensions.reduction import AddReductionAssignment
+from pystencils.types import PsTypeError
 
 
 def test_freeze_simple():
@@ -525,30 +527,34 @@ def test_invalid_reduction_assignments():
     x = fields(f"x: float64[1d]")
     w = TypedSymbol("w", "float64")
 
-    ctx = KernelCreationContext()
-    freeze = FreezeExpressions(ctx)
-
-    one = PsExpression.make(PsConstant(1, ctx.index_dtype))
-    counter = ctx.get_symbol("ctr", ctx.index_dtype)
-    ispace = FullIterationSpace(
-        ctx, [FullIterationSpace.Dimension(one, one, one, counter)]
-    )
-    ctx.set_iteration_space(ispace)
-
-    invalid_assignment = Assignment(w, -1 * x.center())
+    assignment = Assignment(w, -1 * x.center())
     reduction_assignment = AddReductionAssignment(w, 3 * x.center())
 
-    # reduction symbol is used before ReductionAssignment
-    with pytest.raises(FreezeError):
-        _ = [freeze(asm) for asm in [invalid_assignment, reduction_assignment]]
+    expected_errors_for_invalid_cases = [
+        # 1) Reduction symbol is used before ReductionAssignment.
+        #    May only be used for reductions -> KernelConstraintsError
+        ([assignment, reduction_assignment], KernelConstraintsError),
+        # 2) Reduction symbol is used after ReductionAssignment.
+        #    Reduction symbol is converted to pointer after freeze -> PsTypeError
+        ([reduction_assignment, assignment], PsTypeError),
+        # 3) Duplicate ReductionAssignment
+        #    May only be used once for now -> KernelConstraintsError
+        ([reduction_assignment, reduction_assignment], KernelConstraintsError)
+    ]
 
-    # reduction symbol is used after ReductionAssignment
-    with pytest.raises(FreezeError):
-        _ = [freeze(asm) for asm in [reduction_assignment, invalid_assignment]]
+    for invalid_assignment, error_class in expected_errors_for_invalid_cases:
+        ctx = KernelCreationContext()
+        freeze = FreezeExpressions(ctx)
 
-    # duplicate ReductionAssignment
-    with pytest.raises(FreezeError):
-        _ = [freeze(asm) for asm in [reduction_assignment, reduction_assignment]]
+        one = PsExpression.make(PsConstant(1, ctx.index_dtype))
+        counter = ctx.get_symbol("ctr", ctx.index_dtype)
+        ispace = FullIterationSpace(
+            ctx, [FullIterationSpace.Dimension(one, one, one, counter)]
+        )
+        ctx.set_iteration_space(ispace)
+
+        with pytest.raises(error_class):
+            _ = [freeze(asm) for asm in invalid_assignment]
 
 
 def test_memory_access():
-- 
GitLab


From 490ec9144b01c10c6a6e8d1a927fd42e37a56511 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:51:08 +0200
Subject: [PATCH 169/180] Omit old stuff from reduction_op_to_expr

---
 src/pystencils/backend/reduction_op_mapping.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py
index 59273efab..389c0940a 100644
--- a/src/pystencils/backend/reduction_op_mapping.py
+++ b/src/pystencils/backend/reduction_op_mapping.py
@@ -1,16 +1,8 @@
 from .ast.expressions import PsExpression, PsCall, PsAdd, PsSub, PsMul, PsDiv
-from .exceptions import FreezeError
+from .exceptions import PsInternalCompilerError
 from .functions import PsMathFunction, MathFunctions
 from ..sympyextensions.reduction import ReductionOp
 
-_available_operator_interface: set[ReductionOp] = {
-    ReductionOp.Add,
-    ReductionOp.Sub,
-    ReductionOp.Mul,
-    ReductionOp.Div,
-}
-
-
 def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     match op:
         case ReductionOp.Add:
@@ -26,6 +18,6 @@ def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
         case ReductionOp.Max:
             return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
         case _:
-            raise FreezeError(
+            raise PsInternalCompilerError(
                 f"Found unsupported operation type for reduction assignments: {op}."
             )
-- 
GitLab


From 4106f9fdf0a4f1c0c6806ac3bca2a6ae467d31d1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 15:57:33 +0200
Subject: [PATCH 170/180] Parameterize test_reduction_assignments with
 reduction ops

---
 tests/nbackend/kernelcreation/test_freeze.py | 26 +++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/tests/nbackend/kernelcreation/test_freeze.py b/tests/nbackend/kernelcreation/test_freeze.py
index 987b68043..fe31cf94c 100644
--- a/tests/nbackend/kernelcreation/test_freeze.py
+++ b/tests/nbackend/kernelcreation/test_freeze.py
@@ -68,7 +68,13 @@ from pystencils.sympyextensions.integer_functions import (
     ceil_to_multiple,
     div_ceil,
 )
-from pystencils.sympyextensions.reduction import AddReductionAssignment
+from pystencils.sympyextensions.reduction import (
+    AddReductionAssignment,
+    SubReductionAssignment,
+    MulReductionAssignment,
+    MinReductionAssignment,
+    MaxReductionAssignment,
+)
 from pystencils.types import PsTypeError
 
 
@@ -498,10 +504,22 @@ def test_invalid_arrays():
         _ = freeze(symb_arr)
 
 
-def test_reduction_assignments():
+@pytest.mark.parametrize("reduction_assignment_rhs_type",
+                         [
+                             (AddReductionAssignment, PsAdd),
+                             (SubReductionAssignment, PsSub),
+                             (MulReductionAssignment, PsMul),
+                             (MinReductionAssignment, PsCall),
+                             (MaxReductionAssignment, PsCall),
+                         ])
+def test_reduction_assignments(
+        reduction_assignment_rhs_type
+):
     x = fields(f"x: float64[1d]")
     w = TypedSymbol("w", "float64")
 
+    reduction_op, rhs_type = reduction_assignment_rhs_type
+
     ctx = KernelCreationContext()
     freeze = FreezeExpressions(ctx)
 
@@ -512,7 +530,7 @@ def test_reduction_assignments():
     )
     ctx.set_iteration_space(ispace)
 
-    expr = freeze(AddReductionAssignment(w, 3 * x.center()))
+    expr = freeze(reduction_op(w, 3 * x.center()))
 
     info = ctx.find_reduction_info(w.name)
 
@@ -522,6 +540,8 @@ def test_reduction_assignments():
     assert expr.lhs.symbol == info.local_symbol
     assert expr.lhs.dtype == w.dtype
 
+    assert isinstance(expr.rhs, rhs_type)
+
 
 def test_invalid_reduction_assignments():
     x = fields(f"x: float64[1d]")
-- 
GitLab


From f3105780593ce575581cc6231c7b02f87fe40319 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 16:15:37 +0200
Subject: [PATCH 171/180] Add unit test for ReductionAssignment

---
 tests/frontend/test_sympyextensions.py | 38 +++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/tests/frontend/test_sympyextensions.py b/tests/frontend/test_sympyextensions.py
index ad5d2513b..152527441 100644
--- a/tests/frontend/test_sympyextensions.py
+++ b/tests/frontend/test_sympyextensions.py
@@ -3,7 +3,9 @@ import numpy as np
 import sympy as sp
 import pystencils
 
-from pystencils import Assignment
+import pytest
+
+from pystencils import Assignment, TypedSymbol
 from pystencils.sympyextensions import replace_second_order_products
 from pystencils.sympyextensions import remove_higher_order_terms
 from pystencils.sympyextensions import complete_the_squares_in_exp
@@ -27,6 +29,16 @@ from pystencils.sympyextensions.integer_functions import (
     div_ceil,
 )
 
+from pystencils.sympyextensions.reduction import (
+    ReductionOp,
+    AddReductionAssignment,
+    SubReductionAssignment,
+    MulReductionAssignment,
+    MinReductionAssignment,
+    MaxReductionAssignment,
+    reduction_assignment,
+)
+
 
 def test_utility():
     a = [1, 2]
@@ -199,6 +211,30 @@ def test_count_operations():
     assert ops["muls"] == 99
 
 
+@pytest.mark.parametrize("reduction_assignment_for_op", [
+    (ReductionOp.Add, AddReductionAssignment),
+    (ReductionOp.Sub, SubReductionAssignment),
+    (ReductionOp.Mul, MulReductionAssignment),
+    (ReductionOp.Min, MinReductionAssignment),
+    (ReductionOp.Max, MaxReductionAssignment),
+])
+def test_reduction_assignments(
+        reduction_assignment_for_op
+):
+    reduction_op, reduction_assignment_type = reduction_assignment_for_op
+
+    w = TypedSymbol("w", "float64")
+    v = sympy.symbols("v")
+
+    assignment = reduction_assignment(w, reduction_op, 0)
+
+    assert isinstance(assignment, reduction_assignment_type)
+
+    # invalid assignment since v is not a typed symbol
+    with pytest.raises(TypeError):
+        _ = reduction_assignment(v, reduction_op, 0)
+
+
 def test_common_denominator():
     x = sympy.symbols("x")
     expr = sympy.Rational(1, 2) + x * sympy.Rational(2, 3)
-- 
GitLab


From 3757e179c0e3b73f715819dbef1c316ba1b13af8 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 16:27:07 +0200
Subject: [PATCH 172/180] Omit unnecessary replace_symbol call

---
 src/pystencils/backend/kernelcreation/context.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 1d7e75db1..3e79bf24a 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -219,10 +219,8 @@ class KernelCreationContext:
                 f"Make sure that it is only used once in a kernel's ReductionAssignment."
             )
 
-        # replace datatype of lhs symbol with pointer datatype for write-back mechanism
-        symb = self.get_symbol(lhs_name, lhs_dtype)
-        pointer_symb = PsSymbol(lhs_name, PsPointerType(lhs_dtype))
-        self.replace_symbol(symb, pointer_symb)
+        # add symbol for lhs with pointer datatype for write-back mechanism
+        pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype))
 
         # create kernel-local copy of lhs symbol
         local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype)
-- 
GitLab


From 508701f6d3f352d0adfb655b8ea9aae3a15a5745 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 16:32:16 +0200
Subject: [PATCH 173/180] Fix lint

---
 src/pystencils/backend/kernelcreation/freeze.py | 2 +-
 src/pystencils/backend/reduction_op_mapping.py  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 4c7b8fb23..b1bb4cd4a 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -63,7 +63,7 @@ from ..ast.vector import PsVecMemAcc
 from ..constants import PsConstant
 from ...types import PsNumericType, PsStructType, PsType
 from ..exceptions import PsInputError
-from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions
+from ..functions import PsMathFunction, MathFunctions
 from ..exceptions import FreezeError
 
 
diff --git a/src/pystencils/backend/reduction_op_mapping.py b/src/pystencils/backend/reduction_op_mapping.py
index 389c0940a..a97a496e0 100644
--- a/src/pystencils/backend/reduction_op_mapping.py
+++ b/src/pystencils/backend/reduction_op_mapping.py
@@ -3,6 +3,7 @@ from .exceptions import PsInternalCompilerError
 from .functions import PsMathFunction, MathFunctions
 from ..sympyextensions.reduction import ReductionOp
 
+
 def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
     match op:
         case ReductionOp.Add:
-- 
GitLab


From 9c5b9bce9065759461bb0c6ed2e88b4a665789d6 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 16:35:17 +0200
Subject: [PATCH 174/180] Omit unnecessary type context creations for
 PsVecHorizontal

---
 .../backend/kernelcreation/typification.py    | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 9585cb23f..8dfc57a7a 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -584,34 +584,32 @@ class Typifier:
                 # bin op consisting of a scalar and a vector that is converted to a scalar
                 # -> whole expression should be treated as scalar
 
-                scalar_op_tc = TypeContext()
-                self.visit_expr(expr.scalar_operand, scalar_op_tc)
+                self.visit_expr(expr.scalar_operand, tc)
 
-                vector_op_tc = TypeContext()
-                self.visit_expr(expr.vector_operand, vector_op_tc)
+                self.visit_expr(expr.vector_operand, tc)
 
-                if scalar_op_tc.target_type is None or vector_op_tc.target_type is None:
+                if tc.target_type is None or tc.target_type is None:
                     raise TypificationError(
                         f"Unable to determine type of argument to vector horizontal: {expr}"
                     )
 
-                if not isinstance(scalar_op_tc.target_type, PsScalarType):
+                if not isinstance(tc.target_type, PsScalarType):
                     raise TypificationError(
-                        f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}"
+                        f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}"
                     )
 
-                if not isinstance(vector_op_tc.target_type, PsVectorType):
+                if not isinstance(tc.target_type, PsVectorType):
                     raise TypificationError(
-                        f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}"
+                        f"Illegal type in vector operand (op2) to vector horizontal: {tc.target_type}"
                     )
 
-                if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type:
+                if tc.target_type.scalar_type is not tc.target_type:
                     raise TypificationError(
-                        f"Scalar type of vector operand {vector_op_tc.target_type} "
-                        f"does not correspond to type of scalar operand {scalar_op_tc.target_type}"
+                        f"Scalar type of vector operand {tc.target_type} "
+                        f"does not correspond to type of scalar operand {tc.target_type}"
                     )
 
-                tc.apply_dtype(scalar_op_tc.target_type, expr)
+                tc.apply_dtype(tc.target_type, expr)
 
             case PsBinOp(op1, op2):
                 self.visit_expr(op1, tc)
-- 
GitLab


From a14f13fb6bd17fb9dbe8e4db00a3ebd3fd2391a1 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Fri, 25 Apr 2025 17:39:22 +0200
Subject: [PATCH 175/180] Omit extra type context creation for scalar op in
 PsVecHorizontal

---
 .../backend/kernelcreation/typification.py        | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py
index 9585cb23f..1c34fac6f 100644
--- a/src/pystencils/backend/kernelcreation/typification.py
+++ b/src/pystencils/backend/kernelcreation/typification.py
@@ -584,20 +584,19 @@ class Typifier:
                 # bin op consisting of a scalar and a vector that is converted to a scalar
                 # -> whole expression should be treated as scalar
 
-                scalar_op_tc = TypeContext()
-                self.visit_expr(expr.scalar_operand, scalar_op_tc)
+                self.visit_expr(expr.scalar_operand, tc)
 
                 vector_op_tc = TypeContext()
                 self.visit_expr(expr.vector_operand, vector_op_tc)
 
-                if scalar_op_tc.target_type is None or vector_op_tc.target_type is None:
+                if tc.target_type is None or vector_op_tc.target_type is None:
                     raise TypificationError(
                         f"Unable to determine type of argument to vector horizontal: {expr}"
                     )
 
-                if not isinstance(scalar_op_tc.target_type, PsScalarType):
+                if not isinstance(tc.target_type, PsScalarType):
                     raise TypificationError(
-                        f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}"
+                        f"Illegal type in scalar operand (op1) to vector horizontal: {tc.target_type}"
                     )
 
                 if not isinstance(vector_op_tc.target_type, PsVectorType):
@@ -605,13 +604,13 @@ class Typifier:
                         f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}"
                     )
 
-                if vector_op_tc.target_type.scalar_type is not scalar_op_tc.target_type:
+                if vector_op_tc.target_type.scalar_type is not tc.target_type:
                     raise TypificationError(
                         f"Scalar type of vector operand {vector_op_tc.target_type} "
-                        f"does not correspond to type of scalar operand {scalar_op_tc.target_type}"
+                        f"does not correspond to type of scalar operand {tc.target_type}"
                     )
 
-                tc.apply_dtype(scalar_op_tc.target_type, expr)
+                tc.apply_dtype(tc.target_type, expr)
 
             case PsBinOp(op1, op2):
                 self.visit_expr(op1, tc)
-- 
GitLab


From a9c8d6a7ccf65e79fc57d3e92818becb513ae4d3 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Sat, 26 Apr 2025 21:01:57 +0200
Subject: [PATCH 176/180] Apply 4 suggestion(s) to 2 file(s)

Co-authored-by: Frederik Hennig <frederik.hennig@fau.de>
---
 src/pystencils/backend/kernelcreation/context.py | 9 ++++-----
 src/pystencils/backend/kernelcreation/freeze.py  | 5 ++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 3e79bf24a..48e2f4a3a 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -207,7 +207,7 @@ class KernelCreationContext:
     def add_reduction_info(
         self,
         lhs_name: str,
-        lhs_dtype: PsType,
+        lhs_dtype: PsNumericType,
         reduction_op: ReductionOp,
     ):
         """Create ReductionInfo instance and add to its corresponding lookup table for a given symbol name."""
@@ -215,16 +215,15 @@ class KernelCreationContext:
         # make sure that lhs symbol never occurred before ReductionAssignment
         if self.find_symbol(lhs_name):
             raise KernelConstraintsError(
-                f"Left-hand side {lhs_name} of ReductionAssignment already exists in symbol table. "
-                f"Make sure that it is only used once in a kernel's ReductionAssignment."
+                f"Cannot create reduction with symbol {lhs_name}: "
+                "Another symbol with the same name already exist."
             )
 
         # add symbol for lhs with pointer datatype for write-back mechanism
         pointer_symb = self.get_symbol(lhs_name, PsPointerType(lhs_dtype))
 
         # create kernel-local copy of lhs symbol
-        local_symb = PsSymbol(f"{lhs_name}_local", lhs_dtype)
-        self.add_symbol(local_symb)
+        local_symb = self.get_new_symbol(f"{lhs_name}_local", lhs_dtype)
 
         # match for reduction operation and set neutral init_val
         init_val: PsExpression
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index b1bb4cd4a..598716567 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -198,9 +198,8 @@ class FreezeExpressions:
         lhs_dtype = lhs_symbol.dtype
         lhs_name = lhs_symbol.name
 
-        assert isinstance(
-            lhs_dtype, PsNumericType
-        ), "Reduction assignments require type information of the lhs symbol."
+        if not isinstance(lhs_dtype, PsNumericType):
+            raise FreezeError("Reduction symbol must have a numeric data type.")
 
         # get reduction info from context
         reduction_info = self._ctx.add_reduction_info(
-- 
GitLab


From 65362ddfc2bed0a28a7c071962366c206d552d97 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 30 Apr 2025 13:55:29 +0200
Subject: [PATCH 177/180] Fix typecheck note for match args of
 PsConstantFunction

---
 src/pystencils/backend/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index 3ff61e039..6a9d3e4f4 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -163,7 +163,7 @@ class PsConstantFunction(PsFunction):
     and will be broadcast by the vectorizer.
     """
 
-    __match_args__ = ("func,")
+    __match_args__ = ("func",)
 
     def __init__(
         self, func: ConstantFunctions, dtype: PsNumericType | None = None
-- 
GitLab


From 2ca58226cdac171f0373f4cd23b4efb77e9e7505 Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 30 Apr 2025 15:28:14 +0200
Subject: [PATCH 178/180] Minor adaptation of required gpu headers

---
 src/pystencils/backend/platforms/cuda.py        | 2 +-
 src/pystencils/backend/platforms/generic_gpu.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py
index 6e6488ee1..60571db94 100644
--- a/src/pystencils/backend/platforms/cuda.py
+++ b/src/pystencils/backend/platforms/cuda.py
@@ -33,7 +33,7 @@ class CudaPlatform(GenericGpu):
 
     @property
     def required_headers(self) -> set[str]:
-        return super().required_headers | {'"pystencils_runtime/cuda.cuh"'}
+        return super().required_headers | {'"pystencils_runtime/cuda.cuh"', '"gpu_atomics.h"'}
 
     def resolve_reduction(
         self,
diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index 2cfe11d51..b87e6411f 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -203,9 +203,7 @@ class GenericGpu(Platform):
     @property
     @abstractmethod
     def required_headers(self) -> set[str]:
-        return {
-            '"gpu_atomics.h"',
-        }
+        return set()
 
     @abstractmethod
     def resolve_reduction(
-- 
GitLab


From c430559032934ce9234dfad11fea2d60bd37c6ff Mon Sep 17 00:00:00 2001
From: zy69guqi <richard.angersbach@fau.de>
Date: Wed, 30 Apr 2025 15:39:06 +0200
Subject: [PATCH 179/180] Fix missing resolution of ConstantFunctions on GPU
 platforms

---
 src/pystencils/backend/platforms/generic_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py
index b87e6411f..06b230454 100644
--- a/src/pystencils/backend/platforms/generic_gpu.py
+++ b/src/pystencils/backend/platforms/generic_gpu.py
@@ -313,7 +313,7 @@ class GenericGpu(Platform):
         arg_types = (dtype,) * call.function.arg_count
         expr: PsExpression | None = None
 
-        if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
+        if isinstance(dtype, PsIeeeFloatType):
             match func:
                 case (
                     MathFunctions.Exp
-- 
GitLab


From 3e66466514bc1d426056369e21d0ed53580082f5 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Wed, 30 Apr 2025 15:47:44 +0200
Subject: [PATCH 180/180] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Frederik Hennig <frederik.hennig@fau.de>
---
 src/pystencils/codegen/driver.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py
index 74a07b902..59e313913 100644
--- a/src/pystencils/codegen/driver.py
+++ b/src/pystencils/codegen/driver.py
@@ -320,8 +320,7 @@ class DefaultKernelCreationDriver:
         append_ast = [PsAssignment(ptr_access, write_back_ptr)]
 
         # modify AST
-        kernel_ast.statements = prepend_ast + kernel_ast.statements
-        kernel_ast.statements += append_ast
+        kernel_ast.statements = prepend_ast + kernel_ast.statements + append_ast
 
     def _transform_for_cpu(self, kernel_ast: PsBlock) -> PsBlock:
         canonicalize = CanonicalizeSymbols(self._ctx, True)
-- 
GitLab