diff --git a/docs/source/backend/gpu_codegen.md b/docs/source/backend/gpu_codegen.md
index 3fe00840ed13faeb837d91711bca0be1a390aa4e..a95c365665bf803ce3cc683b40319c72dbf857ca 100644
--- a/docs/source/backend/gpu_codegen.md
+++ b/docs/source/backend/gpu_codegen.md
@@ -1,3 +1,4 @@
+(gpu_codegen)=
 # GPU Code Generation
 
 The code generation infrastructure for Nvidia and AMD GPUs using CUDA and HIP comprises the following components:
diff --git a/docs/source/backend/index.rst b/docs/source/backend/index.rst
index 0d384c55bc1e5933a055365f9d5ffe4143c902b6..b9b400544de4657d77bbc67a88db4092c3bd8001 100644
--- a/docs/source/backend/index.rst
+++ b/docs/source/backend/index.rst
@@ -16,6 +16,7 @@ who wish to customize or extend the behaviour of the code generator in their app
    iteration_space
    translation
    platforms
+   reduction_codegen
    transformations
    gpu_codegen
    errors
diff --git a/docs/source/backend/reduction_codegen.md b/docs/source/backend/reduction_codegen.md
new file mode 100644
index 0000000000000000000000000000000000000000..360c692568ada1147b99ef7c829ab6cdc9af02fa
--- /dev/null
+++ b/docs/source/backend/reduction_codegen.md
@@ -0,0 +1,139 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import pystencils as ps
+import numpy as np
+import cupy as cp
+```
+
+(codegen_reductions)=
+# Code Generation for Reductions
+
+In this guide, we demonstrate how reduction kernels can be generated for different platforms and what impact certain
+optimization strategies have.
+For this, we set up the update rule for a simple dot-product kernel:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x, y = ps.fields("x, y: double[3D]", layout="fzyx")
+
+assign_dot_prod = ps.AddReductionAssignment(r, x.center() * y.center())
+```
+
+## CPU Platforms
+
+We first consider a base variant for CPUs that does not employ any optimizations.
+The generated code for this variant looks as follows:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel_cpu = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu)
+```
+
+We want the reduction kernel to be SIMD-vectorized and to employ shared-memory parallelism using OpenMP.
+The SIMD instruction sets supported for reductions are:
+* SSE3
+* AVX/AVX2
+* AVX512
+
+Below, AVX vectorization is employed by selecting the target `Target.X86_AVX`.
+**Note that reductions require `assume_inner_stride_one` to be enabled.**
+This is because other inner strides would require masked SIMD operations,
+which are not supported yet.
+
+```{code-cell} ipython3
+# configure SIMD vectorization
+cfg = ps.CreateKernelConfig(
+    target=ps.Target.X86_AVX,
+)
+cfg.cpu.vectorize.enable = True
+cfg.cpu.vectorize.assume_inner_stride_one = True
+
+# configure OpenMP parallelization
+cfg.cpu.openmp.enable = True
+cfg.cpu.openmp.num_threads = 8
+
+kernel_cpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_cpu_opt)
+```
+
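+The optimized kernel can be compiled and executed like any other pystencils kernel.
+The following sketch, assuming an AVX-capable host CPU and an available host compiler,
+runs it and compares the result with {any}`numpy.dot`:
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+kernel_func = kernel_cpu_opt.compile()
+
+x_arr = np.ones((8, 8, 8), dtype="float64")
+y_arr = np.full((8, 8, 8), 2.0, dtype="float64")
+res = np.zeros((1,), dtype="float64")  # receives the reduction result
+
+kernel_func(x=x_arr, y=y_arr, r=res)
+res[0]  # equals np.dot(x_arr.ravel(), y_arr.ravel())
+```
+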
+## GPU Platforms
+
+Reductions are currently only supported for CUDA platforms.
+Similar to the CPU section, a base variant for GPUs without explicitly employing any optimizations is shown:
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+
+kernel_gpu = ps.create_kernel(assign_dot_prod, cfg)
+
+ps.inspect(kernel_gpu)
+```
+
+As evident from the code, the generated kernel employs atomic operations for updating the pointer
+holding the reduction result.
+Using the explicit warp-level instructions provided by CUDA allows us to achieve higher performance
+than with atomic operations alone.
+To generate kernels with warp-level reductions, the generator expects the CUDA block sizes to be divisible by
+the hardware's warp size.
+**Similar to the SIMD configuration, we assure the code generator that the configured block size fulfills this
+criterion by enabling `assume_warp_aligned_block_size`.**
+While the default block sizes provided by the code generator already fulfill this criterion,
+we employ a block fitting algorithm to obtain a block size that is also optimized for the kernel's iteration space.
+
+You can find more detailed information about warp size alignment in {ref}`gpu_codegen`.
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+cfg.gpu.assume_warp_aligned_block_size = True
+
+kernel_gpu_opt = ps.create_kernel(assign_dot_prod, cfg)
+
+kernel_func = kernel_gpu_opt.compile()
+kernel_func.launch_config.fit_block_size((32, 1, 1))
+
+ps.inspect(kernel_gpu_opt)
+```
+
+:::{admonition} Developers To Do:
+
+- Support for HIP platforms
+- Support vectorization using NEON intrinsics
+:::
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6dba50af184ee95c7378a2e923bd76a6d97883a2..4e1070979fb7e02f3c6be799761f3999fec10476 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -81,6 +81,7 @@ Topics
 
    user_manual/symbolic_language
    user_manual/kernelcreation
+   user_manual/reductions
    user_manual/gpu_kernels
    user_manual/WorkingWithTypes
 
diff --git a/docs/source/user_manual/reductions.md b/docs/source/user_manual/reductions.md
new file mode 100644
index 0000000000000000000000000000000000000000..454fd7df4c4f686075c70e45f38bf2d5563ae6af
--- /dev/null
+++ b/docs/source/user_manual/reductions.md
@@ -0,0 +1,158 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.16.4
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+mystnb:
+  execution_mode: cache
+---
+
+```{code-cell} ipython3
+:tags: [remove-cell, raises-exception]
+
+import sympy as sp
+import pystencils as ps
+import numpy as np
+import cupy as cp
+
+from enum import Enum
+```
+
+(guide_reductions)=
+# Reductions in Pystencils
+
+Reductions play a vital role in numerical simulations as they allow aggregating data across multiple elements,
+such as computing sums or products over an array, or finding its minimum or maximum.
+
+## Specifying Assignments with Reductions
+
+In pystencils, reductions are made available via specialized assignments, namely `ReductionAssignment`.
+Here is a snippet creating a reduction assignment for adding up all elements of a field:
+
+```{code-cell} ipython3
+r = ps.TypedSymbol("r", "double")
+x = ps.fields("x: double[3D]", layout="fzyx")
+
+assign_sum = ps.AddReductionAssignment(r, x.center())
+```
+
+For each point in the iteration space, the left-hand side symbol `r` accumulates the contents of the
+right-hand side `x.center()`. In our case, the `AddReductionAssignment` denotes an accumulation via additions.
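+
+Conceptually, the kernel thus computes the following, shown here as a plain
+NumPy sketch of the semantics rather than the actual generated code:
+
+```{code-cell} ipython3
+x_arr = np.ones((4, 4, 4))
+r_val = 0.0           # initial value of the reduction symbol
+r_val += x_arr.sum()  # accumulate x.center() over the whole iteration space
+r_val
+```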
+
+**Pystencils requires type information about the reduction symbols and thus requires `r` to be a `TypedSymbol`.**
+
+The following reduction assignment classes are available in pystencils:
+* `AddReductionAssignment`: Builds the sum over all elements
+* `SubReductionAssignment`: Builds the difference over all elements
+* `MulReductionAssignment`: Builds the product over all elements
+* `MinReductionAssignment`: Finds the minimum element
+* `MaxReductionAssignment`: Finds the maximum element
+
+:::{note}
+Alternatively, you can also make use of the `reduction_assignment` or `reduction_assignment_from_str` functions
+to specify reduction assignments:
+:::
+
+```{code-cell} ipython3
+from pystencils.sympyextensions import reduction_assignment, reduction_assignment_from_str
+from pystencils.sympyextensions.reduction import ReductionOp
+
+assign_sum = reduction_assignment(r, ReductionOp.Add, x.center())
+
+assign_sum = reduction_assignment_from_str(r, "+", x.center())
+```
+
+For other reduction operations, the following enum constants can be passed to `reduction_assignment`,
+or the corresponding strings can be passed to `reduction_assignment_from_str`.
+
+```{code-cell} python3
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Min = "min"
+    Max = "max"
+```
+
+## Generating Reduction Kernels
+
+With the assignment fully assembled, we can finally invoke the code generator and
+create the kernel object via the {any}`create_kernel` function.
+For this example, we assume a kernel configuration where no optimizations are explicitly enabled.
+
+```{code-cell} ipython3
+cfg = ps.CreateKernelConfig(target=ps.Target.CurrentCPU)
+kernel = ps.create_kernel(assign_sum, cfg)
+
+ps.inspect(kernel)
+```
+
+:::{note}
+The generated reduction kernels can differ substantially between platforms and optimization choices.
+For the sake of compactness, the impact of these choices is not shown here.
+
+A detailed description of the configuration options and their impact on the generated code can be found in
+{ref}`codegen_reductions`.
+:::
+
+The kernel can be compiled and run immediately.
+
+To execute the kernel on CPUs, a {any}`numpy.ndarray` has to be passed not only for each field,
+but also for exporting the reduction result.
+The export mechanism can be seen in the previously generated code snippet.
+Here, the kernel obtains a pointer with the name of the reduction symbol (here: `r`).
+This pointer not only allows providing an initial value for the reduction but is also used for writing back the
+reduction result.
+Since our reduction result is a single scalar value, it is sufficient to set up an array comprising a single value.
+
+```{code-cell} ipython3
+kernel_func = kernel.compile()
+
+x_array = np.ones((4, 4, 4), dtype="float64")
+reduction_result = np.zeros((1,), dtype="float64")
+
+kernel_func(x=x_array, r=reduction_result)
+
+reduction_result[0]
+```
+
+For GPU platforms, the concepts remain the same, but the fields and the write-back pointer now require device
+memory, i.e. instances of {any}`cupy.ndarray`, as shown below.
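+
+A minimal sketch, assuming a CUDA-capable device is available
+(the kernel itself is unchanged apart from the target):
+
+```{code-cell} ipython3
+:tags: [raises-exception]
+
+gpu_cfg = ps.CreateKernelConfig(target=ps.Target.CUDA)
+gpu_kernel_func = ps.create_kernel(assign_sum, gpu_cfg).compile()
+
+x_gpu = cp.ones((4, 4, 4), dtype="float64")
+r_gpu = cp.zeros((1,), dtype="float64")
+
+gpu_kernel_func(x=x_gpu, r=r_gpu)
+r_gpu[0]
+```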
+
+:::{admonition} Developers To Do:
+
+- Support for higher-order data types for reductions, e.g. vector/matrix reductions
+:::
diff --git a/src/pystencils/__init__.py b/src/pystencils/__init__.py
index 07283d5294bc08c8e68e6a40af0f956b36a0129a..329f61d32087df392406d17b5db6bcb520798317 100644
--- a/src/pystencils/__init__.py
+++ b/src/pystencils/__init__.py
@@ -1,10 +1,6 @@
 """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
 
-from .codegen import (
-    Target,
-    CreateKernelConfig,
-    AUTO
-)
+from .codegen import Target, CreateKernelConfig, AUTO
 from .defaults import DEFAULTS
 from . import fd
 from . import stencil as stencil
@@ -34,6 +30,13 @@ from .simp import AssignmentCollection
 from .sympyextensions.typed_sympy import TypedSymbol, DynamicType
 from .sympyextensions import SymbolCreator, tcast
 from .datahandling import create_data_handling
+from .sympyextensions.reduction import (
+    AddReductionAssignment,
+    SubReductionAssignment,
+    MulReductionAssignment,
+    MinReductionAssignment,
+    MaxReductionAssignment,
+)
 
 __all__ = [
     "Field",
@@ -61,6 +64,11 @@ __all__ = [
     "AssignmentCollection",
     "Assignment",
    "AddAugmentedAssignment",
+    "AddReductionAssignment",
+    "SubReductionAssignment",
+    "MulReductionAssignment",
+    "MinReductionAssignment",
+    "MaxReductionAssignment",
     "assignment_from_stencil",
     "SymbolCreator",
     "create_data_handling",
@@ -81,4 +89,5 @@ __all__ = [
 ]
 
 from . import _version
-__version__ = _version.get_versions()['version']
+
+__version__ = _version.get_versions()["version"]
diff --git a/src/pystencils/backend/ast/analysis.py b/src/pystencils/backend/ast/analysis.py
index edeba04f2b8e5d8727abe1150b9e574808e6811a..7032690a03dcf851a5b5227abd70c98301b907e7 100644
--- a/src/pystencils/backend/ast/analysis.py
+++ b/src/pystencils/backend/ast/analysis.py
@@ -62,7 +62,7 @@ class UndefinedSymbolsCollector:
             case PsAssignment(lhs, rhs):
                 undefined_vars = self(lhs) | self(rhs)
 
-                if isinstance(lhs, PsSymbolExpr):
+                if isinstance(node, PsDeclaration) and isinstance(lhs, PsSymbolExpr):
                     undefined_vars.remove(lhs.symbol)
                 return undefined_vars
 
diff --git a/src/pystencils/backend/ast/vector.py b/src/pystencils/backend/ast/vector.py
index 705d250949f3662695d506feeff30c20649eb1c5..4141b029677e8a79065421c2108da2215ce5ca2f 100644
--- a/src/pystencils/backend/ast/vector.py
+++ b/src/pystencils/backend/ast/vector.py
@@ -3,8 +3,9 @@ from __future__ import annotations
 from typing import cast
 
 from .astnode import PsAstNode
-from .expressions import PsExpression, PsLvalue, PsUnOp
+from .expressions import PsExpression, PsLvalue, PsUnOp, PsBinOp
 from .util import failing_cast
+from ...sympyextensions import ReductionOp
 from ...types import PsVectorType
 
 
@@ -17,7 +18,7 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     """Broadcast a scalar value to N vector lanes."""
 
     __match_args__ = ("lanes", "operand")
-    
+
     def __init__(self, lanes: int, operand: PsExpression):
         super().__init__(operand)
         self._lanes = lanes
@@ -25,26 +26,86 @@ class PsVecBroadcast(PsUnOp, PsVectorOp):
     @property
     def lanes(self) -> int:
         return self._lanes
-    
+
     @lanes.setter
     def lanes(self, n: int):
         self._lanes = n
 
     def _clone_expr(self) -> PsVecBroadcast:
         return PsVecBroadcast(self._lanes, self._operand.clone())
-    
+
     def structurally_equal(self, other: PsAstNode) -> bool:
         if not isinstance(other, PsVecBroadcast):
             return False
+        return super().structurally_equal(other) and self._lanes == other._lanes
+
+
+class PsVecHorizontal(PsBinOp, PsVectorOp):
+    """Horizontal reduction of a vector's N lanes, combined with a scalar operand into a scalar result."""
+
+    __match_args__ = ("lanes", "scalar_operand", "vector_operand", "reduction_op")
+
+    def __init__(
+        self,
+        lanes: int,
+        scalar_operand: PsExpression,
+        vector_operand: PsExpression,
+        reduction_op: ReductionOp,
+    ):
+        super().__init__(scalar_operand, vector_operand)
+        self._lanes = lanes
+        self._reduction_op = reduction_op
+
+    @property
+    def lanes(self) -> int:
+        return self._lanes
+
+    @lanes.setter
+    def lanes(self, n: int):
+        self._lanes = n
+
+    @property
+    def scalar_operand(self) -> PsExpression:
+        return self._op1
+
+    @scalar_operand.setter
+    def scalar_operand(self, op: PsExpression):
+        self._op1 = op
+
+    @property
+    def vector_operand(self) -> PsExpression:
+        return self._op2
+
+    @vector_operand.setter
+    def vector_operand(self, op: PsExpression):
+        self._op2 = op
+
+    @property
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
+
+    @reduction_op.setter
+    def reduction_op(self, op: ReductionOp):
+        self._reduction_op = op
+
+    def _clone_expr(self) -> PsVecHorizontal:
+        return PsVecHorizontal(
+            self._lanes, self._op1.clone(), self._op2.clone(), self._reduction_op
+        )
+
+    def structurally_equal(self, other: PsAstNode) -> bool:
+        if not isinstance(other, PsVecHorizontal):
+            return False
         return (
             super().structurally_equal(other)
             and self._lanes == other._lanes
+            and self._reduction_op == other._reduction_op
         )
 
 
 class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     """Pointer-based vectorized memory access.
-    
+
     Args:
         base_ptr: Pointer identifying the accessed memory region
         offset: Offset inside the memory region
@@ -95,7 +156,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
     @property
     def stride(self) -> PsExpression | None:
         return self._stride
-    
+
     @stride.setter
     def stride(self, expr: PsExpression | None):
         self._stride = expr
@@ -106,10 +167,12 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
 
     def get_vector_type(self) -> PsVectorType:
         return cast(PsVectorType, self._dtype)
-    
+
     def get_children(self) -> tuple[PsAstNode, ...]:
-        return (self._ptr, self._offset) + (() if self._stride is None else (self._stride,))
-    
+        return (self._ptr, self._offset) + (
+            () if self._stride is None else (self._stride,)
+        )
+
     def set_child(self, idx: int, c: PsAstNode):
         idx = [0, 1, 2][idx]
         match idx:
@@ -138,7 +201,7 @@ class PsVecMemAcc(PsExpression, PsLvalue, PsVectorOp):
             and self._vector_entries == other._vector_entries
             and self._aligned == other._aligned
         )
-    
+
     def __repr__(self) -> str:
         return (
             f"PsVecMemAcc({repr(self._ptr)}, {repr(self._offset)}, {repr(self._vector_entries)}, "
diff --git a/src/pystencils/backend/emission/ir_printer.py b/src/pystencils/backend/emission/ir_printer.py
index ffb65181ccd71ff95dffd6d006617dadc6809eea..22ae2f91a1ee25a3e8b5ab0097f46d5f8e6f05c2 100644
--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -10,7 +10,7 @@ from .base_printer import BasePrinter, Ops, LR
 
 from ..ast import PsAstNode
 from ..ast.expressions import PsBufferAcc
-from ..ast.vector import PsVecMemAcc, PsVecBroadcast
+from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal
 
 if TYPE_CHECKING:
     from ...codegen import Kernel
@@ -24,7 +24,7 @@ def emit_ir(ir: PsAstNode | Kernel):
 
 class IRAstPrinter(BasePrinter):
     """Print the IR AST as pseudo-code.
-    
+
     This printer produces a complete pseudocode representation of a pystencils AST.
     Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for each
     node defined in `ast <pystencils.backend.ast>`.
@@ -77,6 +77,17 @@ class IRAstPrinter(BasePrinter):
                     f"vec_broadcast<{lanes}>({operand_code})", Ops.Weakest
                 )
 
+            case PsVecHorizontal(lanes, scalar_operand, vector_operand, reduction_op):
+                pc.push_op(Ops.Weakest, LR.Middle)
+                scalar_operand_code = self.visit(scalar_operand, pc)
+                vector_operand_code = self.visit(vector_operand, pc)
+                pc.pop_op()
+
+                return pc.parenthesize(
+                    f"vec_horizontal_{reduction_op.name.lower()}<{lanes}>({scalar_operand_code}, {vector_operand_code})",
+                    Ops.Weakest,
+                )
+
             case _:
                 return super().visit(node, pc)
 
diff --git a/src/pystencils/backend/functions.py b/src/pystencils/backend/functions.py
index f3d18f3498079dcd06c2f4ffa61a87ff59e47117..4e38de5e9f11ca1d971ae6659f04e6df7b47f64a 100644
--- a/src/pystencils/backend/functions.py
+++ b/src/pystencils/backend/functions.py
@@ -30,6 +30,7 @@ from typing import Any, Sequence, TYPE_CHECKING
 from abc import ABC
 from enum import Enum
 
+from ..sympyextensions import ReductionOp
 from ..types import PsType
 from .exceptions import PsInternalCompilerError
 
@@ -95,17 +96,31 @@ class MathFunctions(Enum):
         self.num_args = num_args
 
 
+class NumericLimitsFunctions(Enum):
+    """Numerical limits functions supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    Min = ("min", 0)
+    Max = ("max", 0)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
+
+
 class PsMathFunction(PsFunction):
     """Homogenously typed mathematical functions."""
 
     __match_args__ = ("func",)
 
-    def __init__(self, func: MathFunctions) -> None:
+    def __init__(self, func: MathFunctions | NumericLimitsFunctions) -> None:
         super().__init__(func.function_name, func.num_args)
         self._func = func
 
     @property
-    def func(self) -> MathFunctions:
+    def func(self) -> MathFunctions | NumericLimitsFunctions:
         return self._func
 
     def __str__(self) -> str:
@@ -121,6 +136,48 @@ class PsMathFunction(PsFunction):
         return hash(self._func)
 
 
+class ReductionFunctions(Enum):
+    """Functions representing the different steps of a reduction kernel that are supported by the backend.
+
+    Each platform has to materialize these functions to a concrete implementation.
+    """
+
+    WriteBackToPtr = ("WriteBackToPtr", 2)
+
+    def __init__(self, func_name, num_args):
+        self.function_name = func_name
+        self.num_args = num_args
+
+
+class PsReductionFunction(PsFunction):
+    """Reduction function that is parametrized with a reduction operation, e.g. the write-back step of a reduction."""
+
+    def __init__(self, func: ReductionFunctions, reduction_op: ReductionOp) -> None:
+        super().__init__(func.function_name, func.num_args)
+        self._func = func
+        self._reduction_op = reduction_op
+
+    @property
+    def func(self) -> ReductionFunctions:
+        return self._func
+
+    @property
+    def reduction_op(self) -> ReductionOp:
+        return self._reduction_op
+
+    def __str__(self) -> str:
+        return f"{self._func.function_name}"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, PsReductionFunction):
+            return False
+
+        return self._func == other._func and self._reduction_op == other._reduction_op
+
+    def __hash__(self) -> int:
+        return hash(self._func)
+
+
 class CFunction(PsFunction):
     """A concrete C function.
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 68da893ff6204c73d61dcebddb1da602f37520c7..358b5ff6cdeb6cca1cfc232c5129ae239bdb3be9 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
+from dataclasses import dataclass
 from typing import Iterable, Iterator, Any
 from itertools import chain, count
 from collections import namedtuple, defaultdict
 import re
 
+from ..ast.expressions import PsExpression
 from ...defaults import DEFAULTS
 from ...field import Field, FieldType
+from ...sympyextensions import ReductionOp
 from ...sympyextensions.typed_sympy import TypedSymbol, DynamicType
 
 from ..memory import PsSymbol, PsBuffer
@@ -44,6 +47,16 @@ class FieldsInKernel:
 FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 
 
+@dataclass(frozen=True)
+class ReductionInfo:
+    """Information about a reduction operation: its neutral element, used as the initial value,
+    and the pointer through which the kernel writes back the result."""
+
+    op: ReductionOp
+    init_val: PsExpression
+    ptr_symbol: PsSymbol
+
+
 class KernelCreationContext:
     """Manages the translation process from the SymPy frontend
     to the backend AST, and collects all necessary information for the translation:
@@ -75,6 +88,8 @@ class KernelCreationContext:
         self._symbol_ctr_pattern = re.compile(r"__[0-9]+$")
         self._symbol_dup_table: defaultdict[str, int] = defaultdict(lambda: 0)
 
+        self._symbols_reduction_info: dict[PsSymbol, ReductionInfo] = dict()
+
         self._fields_and_arrays: dict[str, FieldArrayPair] = dict()
         self._fields_collection = FieldsInKernel()
 
@@ -93,7 +108,7 @@ class KernelCreationContext:
     def index_dtype(self) -> PsIntegerType:
         """Data type used by default for index expressions"""
         return self._index_dtype
-    
+
     def resolve_dynamic_type(self, dtype: DynamicType | PsType) -> PsType:
         """Selects the appropriate data type for `DynamicType` instances, and returns all other types as they are."""
         match dtype:
@@ -178,6 +193,20 @@ class KernelCreationContext:
 
         self._symbols[old.name] = new
 
+    def add_symbol_reduction_info(
+        self, local_symb: PsSymbol, reduction_info: ReductionInfo
+    ):
+        """Adds an entry for a symbol and its reduction info to the context's reduction lookup table.
+
+        The symbol ``local_symb`` must not already have an entry in this table.
+ """ + if local_symb in self._symbols_reduction_info: + raise PsInternalCompilerError( + f"add_symbol_reduction_info: {local_symb.name} already exist in the symbol table" + ) + + self._symbols_reduction_info[local_symb] = reduction_info + def duplicate_symbol( self, symb: PsSymbol, new_dtype: PsType | None = None ) -> PsSymbol: @@ -213,6 +242,11 @@ class KernelCreationContext: """Return an iterable of all symbols listed in the symbol table.""" return self._symbols.values() + @property + def symbols_reduction_info(self) -> dict[PsSymbol, ReductionInfo]: + """Return a dictionary holding kernel-local reduction symbols and their reduction properties.""" + return self._symbols_reduction_info + # Fields and Arrays @property diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index b3ff5aefb525ef311d0e3199c79f60c52617a853..9dc3928b3549e7d0a9eae4ae2e78244e4e9e4b4c 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -1,23 +1,25 @@ from typing import overload, cast, Any from functools import reduce -from operator import add, mul, sub, truediv +from operator import add, mul, sub import sympy as sp -import sympy.core.relational import sympy.logic.boolalg from sympy.codegen.ast import AssignmentBase, AugmentedAssignment +from ..memory import PsSymbol from ...assignment import Assignment from ...simp import AssignmentCollection from ...sympyextensions import ( integer_functions, ConditionalFieldAccess, ) +from ...reduction_op_mapping import reduction_op_to_expr from ...sympyextensions.typed_sympy import TypedSymbol, TypeCast, DynamicType from ...sympyextensions.pointers import AddressOf, mem_acc +from ...sympyextensions.reduction import ReductionAssignment, ReductionOp from ...field import Field, FieldType -from .context import KernelCreationContext +from .context import KernelCreationContext, ReductionInfo from ..ast.structural import ( PsAstNode, @@ -55,14 +57,14 @@ from ..ast.expressions import ( PsAnd, PsOr, PsNot, - PsMemAcc + PsMemAcc, ) from ..ast.vector import PsVecMemAcc from ..constants import PsConstant -from ...types import PsNumericType, PsStructType, PsType +from ...types import PsNumericType, PsStructType, PsType, PsPointerType from ..exceptions import PsInputError -from ..functions import PsMathFunction, MathFunctions +from ..functions import PsMathFunction, MathFunctions, NumericLimitsFunctions from ..exceptions import FreezeError @@ -108,7 +110,9 @@ class FreezeExpressions: def __call__(self, obj: AssignmentCollection | sp.Basic) -> PsAstNode: if isinstance(obj, AssignmentCollection): - return PsBlock([cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments]) + return PsBlock( + [cast(PsStructuralNode, self.visit(asm)) for asm in obj.all_assignments] + ) elif isinstance(obj, AssignmentBase): return cast(PsAssignment, self.visit(obj)) elif isinstance(obj, _ExprLike): @@ -170,19 +174,69 @@ class FreezeExpressions: assert isinstance(lhs, PsExpression) assert isinstance(rhs, PsExpression) - match expr.op: - case "+=": - op = add - case "-=": - op = sub - case "*=": - op = mul - case "/=": - op = truediv + # transform augmented assignment to reduction op + str_to_reduction_op: dict[str, ReductionOp] = { + "+=": ReductionOp.Add, + "-=": ReductionOp.Sub, + "*=": ReductionOp.Mul, + "/=": ReductionOp.Div, + } + + # reuse existing handling for transforming reduction ops to expressions + return PsAssignment( + lhs, 
reduction_op_to_expr(str_to_reduction_op[expr.op], lhs.clone(), rhs) + ) + + def map_ReductionAssignment(self, expr: ReductionAssignment): + assert isinstance(expr.lhs, TypedSymbol) + + lhs = self.visit(expr.lhs) + rhs = self.visit(expr.rhs) + + assert isinstance(rhs, PsExpression) + assert isinstance(lhs, PsSymbolExpr) + + op = expr.reduction_op + orig_lhs_symb = lhs.symbol + dtype = lhs.dtype + + assert isinstance(dtype, PsNumericType), \ + "Reduction assignments require type information of the lhs symbol." + + # replace original symbol with pointer-based type used for export + orig_lhs_symb_as_ptr = PsSymbol(orig_lhs_symb.name, PsPointerType(dtype)) + + # create kernel-local copy of lhs symbol to work with + new_lhs_symb = PsSymbol(f"{orig_lhs_symb.name}_local", dtype) + new_lhs = PsSymbolExpr(new_lhs_symb) + + # get new rhs from augmented assignment + new_rhs: PsExpression = reduction_op_to_expr(op, new_lhs.clone(), rhs) + + # match for reduction operation and set neutral init_val + init_val: PsExpression + match op: + case ReductionOp.Add: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Sub: + init_val = PsConstantExpr(PsConstant(0)) + case ReductionOp.Mul: + init_val = PsConstantExpr(PsConstant(1)) + case ReductionOp.Min: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Max), []) + case ReductionOp.Max: + init_val = PsCall(PsMathFunction(NumericLimitsFunctions.Min), []) case _: - raise FreezeError(f"Unsupported augmented assignment: {expr.op}.") + raise FreezeError(f"Unsupported kind of reduction assignment: {op}.") - return PsAssignment(lhs, op(lhs.clone(), rhs)) + reduction_info = ReductionInfo(op, init_val, orig_lhs_symb_as_ptr) + + # add new symbol for local copy, replace original copy with pointer counterpart and add reduction info + self._ctx.add_symbol(new_lhs_symb) + self._ctx.add_symbol_reduction_info(new_lhs_symb, reduction_info) + self._ctx.replace_symbol(orig_lhs_symb, orig_lhs_symb_as_ptr) + + return PsAssignment(new_lhs, new_rhs) def map_Symbol(self, spsym: sp.Symbol) -> PsSymbolExpr: symb = self._ctx.get_symbol(spsym.name) @@ -214,7 +268,7 @@ class FreezeExpressions: exponent = expr.args[1] expr_frozen = self.visit_expr(base) - + if isinstance(exponent, sp.Rational): # Decompose rational exponent num: int = exponent.numerator @@ -234,7 +288,7 @@ class FreezeExpressions: denom = 1 assert denom == 1 - + # Pairwise multiplication for logarithmic runtime factors = [expr_frozen] + [expr_frozen.clone() for _ in range(num - 1)] while len(factors) > 1: @@ -250,7 +304,7 @@ class FreezeExpressions: expr_frozen = one / expr_frozen return expr_frozen - + # If we got this far, use pow exponent_frozen = self.visit_expr(exponent) expr_frozen = PsMathFunction(MathFunctions.Pow)(expr_frozen, exponent_frozen) @@ -280,22 +334,22 @@ class FreezeExpressions: raise FreezeError("Cannot translate an empty tuple.") items = [self.visit_expr(item) for item in expr] - + if any(isinstance(i, PsArrayInitList) for i in items): # base case: have nested arrays if not all(isinstance(i, PsArrayInitList) for i in items): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + subarrays = cast(list[PsArrayInitList], items) shape_tail = subarrays[0].shape - + if not all(s.shape == shape_tail for s in subarrays[1:]): raise FreezeError( f"Cannot translate nested arrays of non-uniform shape: {expr}" ) - + return PsArrayInitList([s.items_grid for s in subarrays]) # type: ignore else: # base case: no nested arrays diff --git 
a/src/pystencils/backend/kernelcreation/typification.py b/src/pystencils/backend/kernelcreation/typification.py index 62feca26504b753a9898153b4e6fb85e3fc5b2e7..b457f39a078009e329176845bce7e4a021be6861 100644 --- a/src/pystencils/backend/kernelcreation/typification.py +++ b/src/pystencils/backend/kernelcreation/typification.py @@ -49,7 +49,7 @@ from ..ast.expressions import ( PsNeg, PsNot, ) -from ..ast.vector import PsVecBroadcast, PsVecMemAcc +from ..ast.vector import PsVecBroadcast, PsVecMemAcc, PsVecHorizontal from ..functions import PsMathFunction, CFunction from ..ast.util import determine_memory_object from ..exceptions import TypificationError @@ -194,9 +194,10 @@ class TypeContext: f" Target type: {self._target_type}" ) - case PsNumericOpTrait() if not isinstance( - self._target_type, PsNumericType - ) or self._target_type.is_bool(): + case PsNumericOpTrait() if ( + not isinstance(self._target_type, PsNumericType) + or self._target_type.is_bool() + ): # FIXME: PsBoolType derives from PsNumericType, but is not numeric raise TypificationError( f"Numerical operation encountered in non-numerical type context:\n" @@ -579,6 +580,33 @@ class Typifier: else: tc.apply_dtype(PsBoolType(), expr) + case PsVecHorizontal(): + # bin op consisting of a scalar and a vector that is converted to a scalar + # -> whole expression should be treated as scalar + + scalar_op_tc = TypeContext() + self.visit_expr(expr.scalar_operand, scalar_op_tc) + + vector_op_tc = TypeContext() + self.visit_expr(expr.vector_operand, vector_op_tc) + + if scalar_op_tc.target_type is None or vector_op_tc.target_type is None: + raise TypificationError( + f"Unable to determine type of argument to vector horizontal: {expr}" + ) + + if not isinstance(scalar_op_tc.target_type, PsScalarType): + raise TypificationError( + f"Illegal type in scalar operand (op1) to vector horizontal: {scalar_op_tc.target_type}" + ) + + if not isinstance(vector_op_tc.target_type, PsVectorType): + raise TypificationError( + f"Illegal type in vector operand (op2) to vector horizontal: {vector_op_tc.target_type}" + ) + + tc.apply_dtype(scalar_op_tc.target_type, expr) + case PsBinOp(op1, op2): self.visit_expr(op1, tc) self.visit_expr(op2, tc) diff --git a/src/pystencils/backend/platforms/cuda.py b/src/pystencils/backend/platforms/cuda.py index 98ff3e3d332a46074931514ba3af1603dc6318b2..8c3cd45faf2a210bf8de16e3f25d9ba7897d4e0f 100644 --- a/src/pystencils/backend/platforms/cuda.py +++ b/src/pystencils/backend/platforms/cuda.py @@ -1,11 +1,139 @@ from __future__ import annotations +import math + from .generic_gpu import GenericGpu +from ..ast import PsAstNode +from ..ast.expressions import ( + PsExpression, + PsLiteralExpr, + PsCall, + PsAnd, + PsConstantExpr, + PsSymbolExpr, +) +from ..ast.structural import ( + PsConditional, + PsStatement, + PsAssignment, + PsBlock, + PsStructuralNode, +) +from ..constants import PsConstant +from ..exceptions import MaterializationError +from ..functions import NumericLimitsFunctions, CFunction +from ..literals import PsLiteral +from ...reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsType, PsIeeeFloatType, PsCustomType, PsPointerType, PsScalarType +from ...types.quick import SInt, UInt class CudaPlatform(GenericGpu): - """Platform for the CUDA GPU taret.""" + """Platform for the CUDA GPU target.""" @property def required_headers(self) -> set[str]: - return set() + return super().required_headers | { + '"npp.h"', + } + + def resolve_reduction( + self, + 
ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+        stype = symbol_expr.dtype
+        ptrtype = ptr_expr.dtype
+
+        assert isinstance(ptr_expr, PsSymbolExpr) and isinstance(ptrtype, PsPointerType)
+        assert isinstance(symbol_expr, PsSymbolExpr) and isinstance(stype, PsScalarType)
+
+        if not isinstance(stype, PsIeeeFloatType) or stype.width not in (32, 64):
+            raise NotImplementedError(
+                "Atomic operations are only available for float32/64 datatypes"
+            )
+
+        # workaround for subtractions -> use additions for reducing intermediate results
+        # similar to OpenMP reductions: local copies (negative sign) are added at the end
+        match reduction_op:
+            case ReductionOp.Sub:
+                actual_reduction_op = ReductionOp.Add
+            case _:
+                actual_reduction_op = reduction_op
+
+        # check if thread is valid for performing reduction
+        ispace = self._ctx.get_iteration_space()
+        is_valid_thread = self._get_condition_for_translation(ispace)
+
+        cond: PsExpression
+        shuffles: tuple[PsAssignment, ...]
+        if self._warp_size and self._assume_warp_aligned_block_size:
+            # perform local warp reductions
+            def gen_shuffle_instr(offset: int):
+                full_mask = PsLiteralExpr(PsLiteral("0xffffffff", UInt(32)))
+                return PsCall(
+                    CFunction("__shfl_xor_sync", [UInt(32), stype, SInt(32)], stype),
+                    [
+                        full_mask,
+                        symbol_expr,
+                        PsConstantExpr(PsConstant(offset, SInt(32))),
+                    ],
+                )
+
+            # set up shuffle instructions for warp-level reduction
+            num_shuffles = math.frexp(self._warp_size)[1]
+            shuffles = tuple(
+                PsAssignment(
+                    symbol_expr,
+                    reduction_op_to_expr(
+                        actual_reduction_op,
+                        symbol_expr,
+                        gen_shuffle_instr(pow(2, i - 1)),
+                    ),
+                )
+                for i in reversed(range(1, num_shuffles))
+            )
+
+            # find first thread in warp
+            first_thread_in_warp = self._first_thread_in_warp(ispace)
+
+            # set condition to only execute atomic operation on first valid thread in warp
+            cond = (
+                PsAnd(is_valid_thread, first_thread_in_warp)
+                if is_valid_thread
+                else first_thread_in_warp
+            )
+        else:
+            # no optimization: only execute atomic operation on valid threads
+            shuffles = ()
+            cond = is_valid_thread
+
+        # use atomic operation
+        func = CFunction(
+            f"atomic{actual_reduction_op.name}", [ptrtype, stype], PsCustomType("void")
+        )
+        func_args = (ptr_expr, symbol_expr)
+
+        # assemble warp reduction
+        return shuffles, PsConditional(
+            cond, PsBlock([PsStatement(PsCall(func, func_args))])
+        )
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        match func:
+            case NumericLimitsFunctions.Min:
+                define = f"NPP_MINABS_{dtype.width}F"
+            case NumericLimitsFunctions.Max:
+                define = f"NPP_MAXABS_{dtype.width}F"
+            case _:
+                raise MaterializationError(
+                    f"Cannot materialize call to function {func}"
+                )
+
+        return PsLiteralExpr(PsLiteral(define, dtype))
diff --git a/src/pystencils/backend/platforms/generic_cpu.py b/src/pystencils/backend/platforms/generic_cpu.py
index fa8e54002679b9727578f923d4d409208268782c..3de7cf696ff47d65950e376e4aa38c63e1528dcf 100644
--- a/src/pystencils/backend/platforms/generic_cpu.py
+++ b/src/pystencils/backend/platforms/generic_cpu.py
@@ -1,10 +1,21 @@
 from abc import ABC, abstractmethod
 from typing import Sequence
 
-from pystencils.backend.ast.expressions import PsCall
-
-from ..functions import CFunction, PsMathFunction, MathFunctions
-from ...types import PsIntegerType, PsIeeeFloatType
+from ..ast.expressions import PsCall, PsMemAcc, PsConstantExpr
+
+from ..ast import PsAstNode
+from ..functions import ( + CFunction, + MathFunctions, + NumericLimitsFunctions, + ReductionFunctions, + PsMathFunction, + PsReductionFunction, +) +from ..literals import PsLiteral +from ...reduction_op_mapping import reduction_op_to_expr +from ...sympyextensions import ReductionOp +from ...types import PsIntegerType, PsIeeeFloatType, PsScalarType, PsPointerType from .platform import Platform from ..exceptions import MaterializationError @@ -17,7 +28,7 @@ from ..kernelcreation.iteration_space import ( ) from ..constants import PsConstant -from ..ast.structural import PsDeclaration, PsLoop, PsBlock +from ..ast.structural import PsDeclaration, PsLoop, PsBlock, PsStructuralNode from ..ast.expressions import ( PsSymbolExpr, PsExpression, @@ -26,6 +37,7 @@ from ..ast.expressions import ( PsGe, PsLe, PsTernary, + PsLiteralExpr, ) from ..ast.vector import PsVecMemAcc from ...types import PsVectorType, PsCustomType @@ -43,7 +55,7 @@ class GenericCpu(Platform): @property def required_headers(self) -> set[str]: - return {"<cmath>"} + return {"<cmath>", "<limits>"} def materialize_iteration_space( self, body: PsBlock, ispace: IterationSpace @@ -55,13 +67,57 @@ class GenericCpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) + + func = call_func.func + + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + assert isinstance(ptr_expr, PsSymbolExpr) and isinstance( + ptr_expr.dtype, PsPointerType + ) + assert isinstance(symbol_expr, PsSymbolExpr) and isinstance( + symbol_expr.dtype, PsScalarType + ) + + ptr_access = PsMemAcc( + ptr_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + + # inspired by OpenMP: local reduction variable (negative sign) is added at the end + actual_op = ReductionOp.Add if op is ReductionOp.Sub else op + + # create binop and potentially select corresponding function for e.g. 
min or max + potential_call = reduction_op_to_expr(actual_op, ptr_access, symbol_expr) + if isinstance(potential_call, PsCall): + potential_call.dtype = symbol_expr.dtype + return self.select_function(potential_call) + + return potential_call - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args + if isinstance(dtype, PsScalarType) and func in ( + NumericLimitsFunctions.Min, + NumericLimitsFunctions.Max, + ): + return PsLiteralExpr( + PsLiteral( + f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()", + dtype, + ) + ) + if isinstance(dtype, PsIeeeFloatType) and dtype.width in (32, 64): cfunc: CFunction match func: diff --git a/src/pystencils/backend/platforms/generic_gpu.py b/src/pystencils/backend/platforms/generic_gpu.py index 11425d9238e8270b06a628555908622b623931d6..8a4dd11a29c3243bc1cbad2a857646a1a7be038e 100644 --- a/src/pystencils/backend/platforms/generic_gpu.py +++ b/src/pystencils/backend/platforms/generic_gpu.py @@ -1,7 +1,19 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from ...types import constify, deconstify +import operator +from abc import ABC, abstractmethod +from functools import reduce + +from ..ast import PsAstNode +from ..constants import PsConstant +from ...sympyextensions.reduction import ReductionOp +from ...types import ( + constify, + deconstify, + PsScalarType, + PsType, +) +from ...types.quick import SInt from ..exceptions import MaterializationError from .platform import Platform @@ -15,7 +27,12 @@ from ..kernelcreation import ( ) from ..kernelcreation.context import KernelCreationContext -from ..ast.structural import PsBlock, PsConditional, PsDeclaration +from ..ast.structural import ( + PsBlock, + PsConditional, + PsDeclaration, + PsStructuralNode, +) from ..ast.expressions import ( PsExpression, PsLiteralExpr, @@ -23,12 +40,22 @@ from ..ast.expressions import ( PsCall, PsLookup, PsBufferAcc, + PsConstantExpr, + PsAdd, + PsRem, + PsEq, ) from ..ast.expressions import PsLt, PsAnd from ...types import PsSignedIntegerType, PsIeeeFloatType from ..literals import PsLiteral -from ..functions import PsMathFunction, MathFunctions, CFunction - +from ..functions import ( + MathFunctions, + CFunction, + ReductionFunctions, + NumericLimitsFunctions, + PsReductionFunction, + PsMathFunction, +) int32 = PsSignedIntegerType(width=32, const=False) @@ -117,7 +144,7 @@ class Blockwise4DMapping(ThreadMapping): THREAD_IDX[0], BLOCK_IDX[0], BLOCK_IDX[1], - BLOCK_IDX[2] + BLOCK_IDX[2], ] def __call__(self, ispace: IterationSpace) -> dict[PsSymbol, PsExpression]: @@ -163,7 +190,7 @@ class Blockwise4DMapping(ThreadMapping): class GenericGpu(Platform): """Common base platform for CUDA- and HIP-type GPU targets. 
- + Args: ctx: The kernel creation context omit_range_check: If `True`, generated index translation code will not check if the point identified @@ -171,13 +198,40 @@ class GenericGpu(Platform): thread_mapping: Callback object which defines the mapping of thread indices onto iteration space points """ + @property + @abstractmethod + def required_headers(self) -> set[str]: + return { + '"gpu_atomics.h"', + } + + @abstractmethod + def resolve_numeric_limits( + self, func: NumericLimitsFunctions, dtype: PsType + ) -> PsExpression: + pass + + @abstractmethod + def resolve_reduction( + self, + ptr_expr: PsExpression, + symbol_expr: PsExpression, + reduction_op: ReductionOp, + ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]: + pass + def __init__( self, ctx: KernelCreationContext, + assume_warp_aligned_block_size: bool, + warp_size: int | None, thread_mapping: ThreadMapping | None = None, ) -> None: super().__init__(ctx) + self._assume_warp_aligned_block_size = assume_warp_aligned_block_size + self._warp_size = warp_size + self._thread_mapping = ( thread_mapping if thread_mapping is not None else Linear3DMapping() ) @@ -194,14 +248,80 @@ class GenericGpu(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: - assert isinstance(call.function, PsMathFunction) + @staticmethod + def _get_condition_for_translation(ispace: IterationSpace): + + if isinstance(ispace, FullIterationSpace): + conds = [] + + dimensions = ispace.dimensions_in_loop_order() + + for dim in dimensions: + ctr_expr = PsExpression.make(dim.counter) + conds.append(PsLt(ctr_expr, dim.stop)) + + condition: PsExpression = conds[0] + for cond in conds[1:]: + condition = PsAnd(condition, cond) + + return condition + elif isinstance(ispace, SparseIterationSpace): + sparse_ctr_expr = PsExpression.make(ispace.sparse_counter) + stop = PsExpression.make(ispace.index_list.shape[0]) + + return PsLt(sparse_ctr_expr.clone(), stop) + else: + raise MaterializationError(f"Unknown type of iteration space: {ispace}") + + @staticmethod + def _thread_index_per_dim(ispace: IterationSpace) -> tuple[PsExpression, ...]: + """Returns thread indices multiplied with block dimension strides per dimension.""" + + return tuple( + idx + * PsConstantExpr( + PsConstant(reduce(operator.mul, BLOCK_DIM[:i], 1), SInt(32)) + ) + for i, idx in enumerate(THREAD_IDX[: ispace.rank]) + ) + + def _first_thread_in_warp(self, ispace: IterationSpace) -> PsExpression: + """Returns expression that determines whether a thread is the first within a warp.""" + + tids_per_dim = GenericGpu._thread_index_per_dim(ispace) + tid: PsExpression = tids_per_dim[0] + for t in tids_per_dim[1:]: + tid = PsAdd(tid, t) + + return PsEq( + PsRem(tid, PsConstantExpr(PsConstant(self._warp_size, SInt(32)))), + PsConstantExpr(PsConstant(0, SInt(32))), + ) + + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: + call_func = call.function + assert isinstance(call_func, PsReductionFunction | PsMathFunction) + + func = call_func.func + + if ( + isinstance(call_func, PsReductionFunction) + and func is ReductionFunctions.WriteBackToPtr + ): + ptr_expr, symbol_expr = call.args + op = call_func.reduction_op + + return self.resolve_reduction(ptr_expr, symbol_expr, op) - func = call.function.func dtype = call.get_dtype() arg_types = (dtype,) * func.num_args - if isinstance(dtype, PsIeeeFloatType): + if isinstance(dtype, PsScalarType) and isinstance(func, 
NumericLimitsFunctions):
+            return self.resolve_numeric_limits(func, dtype)
+
+        if isinstance(dtype, PsIeeeFloatType) and func in MathFunctions:
             match func:
                 case (
                     MathFunctions.Exp
@@ -262,7 +382,6 @@ class GenericGpu(Platform):
         ctr_mapping = self._thread_mapping(ispace)
 
         indexing_decls = []
-        conds = []
 
         dimensions = ispace.dimensions_in_loop_order()
 
@@ -276,14 +395,9 @@ class GenericGpu(Platform):
             indexing_decls.append(
                 self._typify(PsDeclaration(ctr_expr, ctr_mapping[dim.counter]))
             )
-            conds.append(PsLt(ctr_expr, dim.stop))
 
-        condition: PsExpression = conds[0]
-        for cond in conds[1:]:
-            condition = PsAnd(condition, cond)
-        ast = PsBlock(indexing_decls + [PsConditional(condition, body)])
-
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock(indexing_decls + [PsConditional(cond, body)])
 
     def _prepend_sparse_translation(
         self, body: PsBlock, ispace: SparseIterationSpace
@@ -313,8 +427,5 @@ class GenericGpu(Platform):
         ]
         body.statements = mappings + body.statements
 
-        stop = PsExpression.make(ispace.index_list.shape[0])
-        condition = PsLt(sparse_ctr_expr.clone(), stop)
-        ast = PsBlock([sparse_idx_decl, PsConditional(condition, body)])
-
-        return ast
+        cond = self._get_condition_for_translation(ispace)
+        return PsBlock([sparse_idx_decl, PsConditional(cond, body)])
diff --git a/src/pystencils/backend/platforms/hip.py b/src/pystencils/backend/platforms/hip.py
index c758995a0d9f8fbbb2e9e424bf2cfa6ab7eca086..404d9bb27bf7eded15b9e2f7dc60fb70ceedb1fe 100644
--- a/src/pystencils/backend/platforms/hip.py
+++ b/src/pystencils/backend/platforms/hip.py
@@ -1,11 +1,40 @@
 from __future__ import annotations
 
 from .generic_gpu import GenericGpu
+from ..ast import PsAstNode
+from ..ast.expressions import PsExpression, PsLiteralExpr
+from ..ast.structural import PsStructuralNode
+from ..exceptions import MaterializationError
+from ..functions import NumericLimitsFunctions
+from ..literals import PsLiteral
+from ...sympyextensions import ReductionOp
+from ...types import PsType, PsIeeeFloatType
 
 
 class HipPlatform(GenericGpu):
-    """Platform for the HIP GPU taret."""
+    """Platform for the HIP GPU target."""
 
     @property
     def required_headers(self) -> set[str]:
-        return {'"pystencils_runtime/hip.h"'}
+        return super().required_headers | {'"pystencils_runtime/hip.h"', "<limits>"}
+
+    def resolve_numeric_limits(
+        self, func: NumericLimitsFunctions, dtype: PsType
+    ) -> PsExpression:
+        assert isinstance(dtype, PsIeeeFloatType)
+
+        return PsLiteralExpr(
+            PsLiteral(
+                f"std::numeric_limits<{dtype.c_string()}>::{func.function_name}()",
+                dtype,
+            )
+        )
+
+    def resolve_reduction(
+        self,
+        ptr_expr: PsExpression,
+        symbol_expr: PsExpression,
+        reduction_op: ReductionOp,
+    ) -> tuple[tuple[PsStructuralNode, ...], PsAstNode]:
+
+        raise MaterializationError("Reductions are not yet supported by the HIP backend.")
diff --git a/src/pystencils/backend/platforms/platform.py b/src/pystencils/backend/platforms/platform.py
index 8ed4729a2d67777bbd132d9e48140e20e3656767..7b81865aef50d63e9913161933f90ee86b24533e 100644
--- a/src/pystencils/backend/platforms/platform.py
+++ b/src/pystencils/backend/platforms/platform.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 
-from ..ast.structural import PsBlock
+from ..ast import PsAstNode
+from ..ast.structural import PsBlock, PsStructuralNode
 from ..ast.expressions import PsCall, PsExpression
 
 from ..kernelcreation.context import KernelCreationContext
@@ -11,7 +12,7 @@ class Platform(ABC):
     """Abstract base class for all supported platforms.
The platform performs all target-dependent tasks during code generation: - + - Translation of the iteration space to an index source (loop nest, GPU indexing, ...) - Platform-specific optimizations (e.g. vectorization, OpenMP) """ @@ -37,7 +38,7 @@ class Platform(ABC): @abstractmethod def select_function( self, call: PsCall - ) -> PsExpression: + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: """Select an implementation for the given function on the given data type. If no viable implementation exists, raise a `MaterializationError`. diff --git a/src/pystencils/backend/platforms/sycl.py b/src/pystencils/backend/platforms/sycl.py index d16c4f51b0609e50b1047c8f2fbd3824b8630b18..22d60f9b0e9f8b18995338973ade8ebe8caab8bf 100644 --- a/src/pystencils/backend/platforms/sycl.py +++ b/src/pystencils/backend/platforms/sycl.py @@ -1,12 +1,13 @@ from __future__ import annotations +from ..ast import PsAstNode from ..functions import CFunction, PsMathFunction, MathFunctions from ..kernelcreation.iteration_space import ( IterationSpace, FullIterationSpace, SparseIterationSpace, ) -from ..ast.structural import PsDeclaration, PsBlock, PsConditional +from ..ast.structural import PsDeclaration, PsBlock, PsConditional, PsStructuralNode from ..ast.expressions import ( PsExpression, PsSymbolExpr, @@ -55,7 +56,9 @@ class SyclPlatform(Platform): else: raise MaterializationError(f"Unknown type of iteration space: {ispace}") - def select_function(self, call: PsCall) -> PsExpression: + def select_function( + self, call: PsCall + ) -> PsExpression | tuple[tuple[PsStructuralNode, ...], PsAstNode]: assert isinstance(call.function, PsMathFunction) func = call.function.func diff --git a/src/pystencils/backend/platforms/x86.py b/src/pystencils/backend/platforms/x86.py index 3f0285f0c9d75e1df00d1b662ade818a8b50a9cc..add38cfe419ee352a23bf06aefa6f56c22c59d11 100644 --- a/src/pystencils/backend/platforms/x86.py +++ b/src/pystencils/backend/platforms/x86.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import Sequence +from typing import Sequence, Tuple from enum import Enum from functools import cache @@ -17,8 +17,9 @@ from ..ast.expressions import ( PsCast, PsCall, ) -from ..ast.vector import PsVecMemAcc, PsVecBroadcast -from ...types import PsCustomType, PsVectorType, PsPointerType +from ..ast.vector import PsVecMemAcc, PsVecBroadcast, PsVecHorizontal +from ...sympyextensions import ReductionOp +from ...types import PsCustomType, PsVectorType, PsPointerType, PsType from ..constants import PsConstant from ..exceptions import MaterializationError @@ -132,6 +133,8 @@ class X86VectorCpu(GenericVectorCpu): else: headers = {"<immintrin.h>"} + headers.update({'"simd_horizontal_helpers.h"'}) + return super().required_headers | headers def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType: @@ -160,7 +163,14 @@ class X86VectorCpu(GenericVectorCpu): ) -> PsExpression: match expr: case PsUnOp() | PsBinOp(): - func = _x86_op_intrin(self._vector_arch, expr, expr.get_dtype()) + vtype: PsType + if isinstance(expr, PsVecHorizontal): + # return type of expression itself is scalar, but input argument to intrinsic is a vector + vtype = expr.vector_operand.get_dtype() + else: + vtype = expr.get_dtype() + + func = _x86_op_intrin(self._vector_arch, expr, vtype) intrinsic = func(*operands) intrinsic.dtype = func.return_type return intrinsic @@ -339,6 +349,7 @@ def _x86_op_intrin( prefix = varch.intrin_prefix(vtype) suffix = varch.intrin_suffix(vtype) rtype = atype = varch.intrin_type(vtype) + 
atypes: Tuple[PsType, ...] = ()
 
     match op:
         case PsVecBroadcast():
             opstr = "set1"
             if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
                 suffix += "x"
             atype = vtype.scalar_type
+        case PsVecHorizontal():
+            # horizontal add instead of sub avoids double inversion of sign
+            actual_op = (
+                ReductionOp.Add
+                if op.reduction_op == ReductionOp.Sub
+                else op.reduction_op
+            )
+            opstr = f"horizontal_{actual_op.name.lower()}"
+            rtype = vtype.scalar_type
+            atypes = (vtype.scalar_type, vtype)
         case PsAdd():
             opstr = "add"
         case PsSub():
@@ -392,7 +413,9 @@ def _x86_op_intrin(
         case (SInt(64), Fp()) | (
             Fp(),
             SInt(64),
-        ) if varch < X86VectorArch.AVX512:
+        ) if (
+            varch < X86VectorArch.AVX512
+        ):
             panic()
         # AVX512 only: cvtepiA_epiT if A > T
         case (SInt(a), SInt(t)) if a > t and varch < X86VectorArch.AVX512:
@@ -408,4 +431,7 @@ def _x86_op_intrin(
     )
 
     num_args = 1 if isinstance(op, PsUnOp) else 2
-    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
+    if not atypes:
+        atypes = (atype,) * num_args
+
+    return CFunction(f"{prefix}_{opstr}_{suffix}", atypes, rtype)
diff --git a/src/pystencils/backend/transformations/add_pragmas.py b/src/pystencils/backend/transformations/add_pragmas.py
index bd782422f1fa80b96ec7cf69473fda2b1f45c3d6..fa466e495c826fca6c8f0ae1f338dfd868436ebd 100644
--- a/src/pystencils/backend/transformations/add_pragmas.py
+++ b/src/pystencils/backend/transformations/add_pragmas.py
@@ -9,6 +9,7 @@ from ..ast import PsAstNode
 from ..ast.structural import PsBlock, PsLoop, PsPragma, PsStructuralNode
 from ..ast.expressions import PsExpression
 
+from ...types import PsScalarType
 
 __all__ = ["InsertPragmasAtLoops", "LoopPragma", "AddOpenMP"]
 
@@ -122,6 +123,17 @@ class AddOpenMP:
         if num_threads is not None:
             pragma_text += f" num_threads({str(num_threads)})"
 
+        if ctx.symbols_reduction_info:
+            for symbol, reduction_info in ctx.symbols_reduction_info.items():
+                if isinstance(symbol.dtype, PsScalarType):
+                    pragma_text += (
+                        f" reduction({reduction_info.op.value}: {symbol.name})"
+                    )
+                else:
+                    raise NotImplementedError(
+                        "OMP: Reductions for non-scalar data types are not supported yet."
+ ) + if collapse is not None: if collapse <= 0: raise ValueError( diff --git a/src/pystencils/backend/transformations/loop_vectorizer.py b/src/pystencils/backend/transformations/loop_vectorizer.py index e1e4fea502c08de86e13de5e3c251f1b7a7d0ee6..48b9ad0da567a6221c33c594795c8d4b6e745ea0 100644 --- a/src/pystencils/backend/transformations/loop_vectorizer.py +++ b/src/pystencils/backend/transformations/loop_vectorizer.py @@ -7,9 +7,15 @@ from ...types import PsVectorType, PsScalarType from ..kernelcreation import KernelCreationContext from ..constants import PsConstant from ..ast import PsAstNode -from ..ast.structural import PsLoop, PsBlock, PsDeclaration -from ..ast.expressions import PsExpression, PsTernary, PsGt -from ..ast.vector import PsVecBroadcast +from ..ast.structural import ( + PsLoop, + PsBlock, + PsDeclaration, + PsAssignment, + PsStructuralNode, +) +from ..ast.expressions import PsExpression, PsTernary, PsGt, PsSymbolExpr +from ..ast.vector import PsVecBroadcast, PsVecHorizontal from ..ast.analysis import collect_undefined_symbols from .ast_vectorizer import VectorizationAxis, VectorizationContext, AstVectorizer @@ -134,6 +140,36 @@ class LoopVectorizer: # Prepare vectorization context vc = VectorizationContext(self._ctx, self._lanes, axis) + # Prepare reductions + simd_init_local_reduction_vars: list[PsStructuralNode] = [] + simd_writeback_local_reduction_vars: list[PsStructuralNode] = [] + for symb, reduction_info in self._ctx.symbols_reduction_info.items(): + # Vectorize symbol for local copy + vector_symb = vc.vectorize_symbol(symb) + + # Declare and init vector + simd_init_local_reduction_vars += [ + self._type_fold( + PsDeclaration( + PsSymbolExpr(vector_symb), + PsVecBroadcast(self._lanes, PsSymbolExpr(symb)), + ) + ) + ] + + # Write back vectorization result + simd_writeback_local_reduction_vars += [ + PsAssignment( + PsSymbolExpr(symb), + PsVecHorizontal( + self._lanes, + PsSymbolExpr(symb), + PsSymbolExpr(vector_symb), + reduction_info.op, + ), + ) + ] + # Generate vectorized loop body simd_body = self._vectorize_ast(loop.body, vc) @@ -224,10 +260,10 @@ class LoopVectorizer: ) return PsBlock( - [ - simd_stop_decl, - simd_step_decl, - simd_loop, + simd_init_local_reduction_vars + + [simd_stop_decl, simd_step_decl, simd_loop] + + simd_writeback_local_reduction_vars + + [ trailing_start_decl, trailing_loop, ] @@ -238,11 +274,13 @@ class LoopVectorizer: case LoopVectorizer.TrailingItersTreatment.NONE: return PsBlock( - [ + simd_init_local_reduction_vars + + [ simd_stop_decl, simd_step_decl, simd_loop, ] + + simd_writeback_local_reduction_vars ) @overload diff --git a/src/pystencils/backend/transformations/select_functions.py b/src/pystencils/backend/transformations/select_functions.py index e41c345ae4ed71101d07fcaa5b9df88b1e0f54e2..9ce4046931036a5e16eda2c9cd16faa272752acc 100644 --- a/src/pystencils/backend/transformations/select_functions.py +++ b/src/pystencils/backend/transformations/select_functions.py @@ -1,7 +1,9 @@ +from ..ast.structural import PsAssignment, PsBlock, PsStructuralNode +from ..exceptions import MaterializationError from ..platforms import Platform from ..ast import PsAstNode -from ..ast.expressions import PsCall -from ..functions import PsMathFunction +from ..ast.expressions import PsCall, PsExpression +from ..functions import PsMathFunction, PsReductionFunction class SelectFunctions: @@ -17,7 +19,39 @@ class SelectFunctions: def visit(self, node: PsAstNode) -> PsAstNode: node.children = [self.visit(c) for c in node.children] - if isinstance(node, 
PsCall) and isinstance(node.function, PsMathFunction): - return self._platform.select_function(node) + if isinstance(node, PsAssignment): + rhs = node.rhs + if isinstance(rhs, PsCall) and isinstance( + rhs.function, PsReductionFunction + ): + resolved_func = self._platform.select_function(rhs) + + match resolved_func: + case (prepend, new_rhs): + assert isinstance(prepend, tuple) + + match new_rhs: + case PsExpression(): + return PsBlock( + prepend + (PsAssignment(node.lhs, new_rhs),) + ) + case PsStructuralNode(): + # special case: produces structural with atomic operation writing value back to ptr + return PsBlock(prepend + (new_rhs,)) + case _: + assert False, "Unexpected output from SelectFunctions." + case PsExpression(): + return PsAssignment(node.lhs, resolved_func) + case _: + raise MaterializationError( + f"Wrong return type for resolved function {rhs.function.name} in SelectFunctions." + ) + else: + return node + elif isinstance(node, PsCall) and isinstance(node.function, PsMathFunction): + resolved_func = self._platform.select_function(node) + assert isinstance(resolved_func, PsExpression) + + return resolved_func else: return node diff --git a/src/pystencils/backend/transformations/select_intrinsics.py b/src/pystencils/backend/transformations/select_intrinsics.py index 060192810a7ccb9ab9ed13f64dd7948791078ea4..b20614393b439a8e5e70d682362dbec6769df9c2 100644 --- a/src/pystencils/backend/transformations/select_intrinsics.py +++ b/src/pystencils/backend/transformations/select_intrinsics.py @@ -7,7 +7,7 @@ from ..ast.structural import PsAstNode, PsDeclaration, PsAssignment, PsStatement from ..ast.expressions import PsExpression, PsCall, PsCast, PsLiteral from ...types import PsCustomType, PsVectorType, constify, deconstify from ..ast.expressions import PsSymbolExpr, PsConstantExpr, PsUnOp, PsBinOp -from ..ast.vector import PsVecMemAcc +from ..ast.vector import PsVecMemAcc, PsVecHorizontal from ..exceptions import MaterializationError from ..functions import CFunction, PsMathFunction @@ -86,6 +86,10 @@ class SelectIntrinsics: new_rhs = self.visit_expr(rhs, sc) return PsStatement(self._platform.vector_store(lhs, new_rhs)) + case PsAssignment(lhs, rhs) if isinstance(rhs, PsVecHorizontal): + new_rhs = self.visit_expr(rhs, sc) + return PsAssignment(lhs, new_rhs) + case _: node.children = [self.visit(c, sc) for c in node.children] @@ -93,7 +97,15 @@ class SelectIntrinsics: def visit_expr(self, expr: PsExpression, sc: SelectionContext) -> PsExpression: if not isinstance(expr.dtype, PsVectorType): - return expr + # special case: result type of horizontal reduction is scalar + if isinstance(expr, PsVecHorizontal): + scalar_op = expr.scalar_operand + vector_op_to_scalar = self.visit_expr(expr.vector_operand, sc) + return self._platform.op_intrinsic( + expr, [scalar_op, vector_op_to_scalar] + ) + else: + return expr match expr: case PsSymbolExpr(symb): diff --git a/src/pystencils/codegen/driver.py b/src/pystencils/codegen/driver.py index e9fc69b76b3d88024f9cbde880617d6a3a3696ff..c285dd7bf1a5fb786ed27472004325e0efcfe5f0 100644 --- a/src/pystencils/codegen/driver.py +++ b/src/pystencils/codegen/driver.py @@ -25,8 +25,15 @@ from ..types import PsIntegerType, PsScalarType from ..backend.memory import PsSymbol from ..backend.ast import PsAstNode -from ..backend.ast.structural import PsBlock, PsLoop -from ..backend.ast.expressions import PsExpression +from ..backend.functions import PsReductionFunction, ReductionFunctions +from ..backend.ast.expressions import ( + PsExpression, + PsSymbolExpr, + 
PsCall, + PsMemAcc, + PsConstantExpr, +) +from ..backend.ast.structural import PsBlock, PsLoop, PsDeclaration, PsAssignment from ..backend.ast.analysis import collect_undefined_symbols, collect_required_headers from ..backend.kernelcreation import ( KernelCreationContext, @@ -183,6 +190,31 @@ class DefaultKernelCreationDriver: if self._intermediates is not None: self._intermediates.constants_eliminated = kernel_ast.clone() + # Extensions for reductions + for symbol, reduction_info in self._ctx.symbols_reduction_info.items(): + typify = Typifier(self._ctx) + symbol_expr = typify(PsSymbolExpr(symbol)) + ptr_symbol_expr = typify(PsSymbolExpr(reduction_info.ptr_symbol)) + init_val = typify(reduction_info.init_val) + + ptr_access = PsMemAcc( + ptr_symbol_expr, PsConstantExpr(PsConstant(0, self._ctx.index_dtype)) + ) + write_back_ptr = PsCall( + PsReductionFunction( + ReductionFunctions.WriteBackToPtr, reduction_info.op + ), + [ptr_symbol_expr, symbol_expr], + ) + + # declare and init local copy with neutral element + prepend_ast = [PsDeclaration(symbol_expr, init_val)] + # write back result to reduction target variable + append_ast = [PsAssignment(ptr_access, write_back_ptr)] + + kernel_ast.statements = prepend_ast + kernel_ast.statements + kernel_ast.statements += append_ast + # Target-Specific optimizations if self._target.is_cpu(): kernel_ast = self._transform_for_cpu(kernel_ast) @@ -405,14 +437,18 @@ class DefaultKernelCreationDriver: idx_scheme: GpuIndexingScheme = self._cfg.gpu.get_option("indexing_scheme") manual_launch_grid: bool = self._cfg.gpu.get_option("manual_launch_grid") - assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option("assume_warp_aligned_block_size") + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) warp_size: int | None = self._cfg.gpu.get_option("warp_size") if warp_size is None: warp_size = GpuOptions.default_warp_size(self._target) if warp_size is None and assume_warp_aligned_block_size: - warn("GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`.") + warn( + "GPU warp size is unknown - ignoring assumption `assume_warp_aligned_block_size`." 
+ ) return GpuIndexing( self._ctx, @@ -457,6 +493,11 @@ class DefaultKernelCreationDriver: else None ) + assume_warp_aligned_block_size: bool = self._cfg.gpu.get_option( + "assume_warp_aligned_block_size" + ) + warp_size: int | None = self._cfg.gpu.get_option("warp_size") + GpuPlatform: type match self._target: case Target.CUDA: @@ -468,6 +509,8 @@ class DefaultKernelCreationDriver: return GpuPlatform( self._ctx, + assume_warp_aligned_block_size, + warp_size, thread_mapping=thread_mapping, ) diff --git a/src/pystencils/codegen/gpu_indexing.py b/src/pystencils/codegen/gpu_indexing.py index 43b612bd77f535ef45c09c38489784dd12a53ce0..f5606da0256039ba187d62583f6838e825002f63 100644 --- a/src/pystencils/codegen/gpu_indexing.py +++ b/src/pystencils/codegen/gpu_indexing.py @@ -260,6 +260,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): def __init__( self, + rank: int, num_work_items: _Dim3Lambda, hw_props: HardwareProperties, assume_warp_aligned_block_size: bool, @@ -270,7 +271,7 @@ class DynamicBlockSizeLaunchConfiguration(GpuLaunchConfiguration): self._assume_warp_aligned_block_size = assume_warp_aligned_block_size - default_bs = GpuLaunchConfiguration.get_default_block_size(len(num_work_items)) + default_bs = GpuLaunchConfiguration.get_default_block_size(rank) self._default_block_size = default_bs self._init_block_size: dim3 = default_bs self._compute_block_size: ( @@ -598,6 +599,7 @@ class GpuIndexing: def factory(): return DynamicBlockSizeLaunchConfiguration( + rank, num_work_items, self._hw_props, self._assume_warp_aligned_block_size, diff --git a/src/pystencils/include/gpu_atomics.h b/src/pystencils/include/gpu_atomics.h new file mode 100644 index 0000000000000000000000000000000000000000..6de5c3321e1cff84e00cdf7b3c551638ecb2a99e --- /dev/null +++ b/src/pystencils/include/gpu_atomics.h @@ -0,0 +1,90 @@ +#pragma once + +// No direct implementation for all atomic operations available +// -> add support by custom implementations using a CAS mechanism + +#if defined(__CUDA_ARCH__) || defined(__HIPCC_RTC__) + +// - atomicMul (double/float) +// see https://stackoverflow.com/questions/43354798/atomic-multiplication-and-division +__device__ double atomicMul(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int oldValue = *address_as_ull, assumed; + do { + assumed = oldValue; + oldValue = atomicCAS(address_as_ull, assumed, __double_as_longlong(val * + __longlong_as_double(assumed))); + } while (assumed != oldValue); + + return __longlong_as_double(oldValue); +} + +__device__ float atomicMul(float* address, float val) { + int* address_as_int = (int*)address; + int old = *address_as_int; + int assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +#endif + +#ifdef __CUDA_ARCH__ + +// - atomicMin (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMin(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val < __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMin(float *address, float val) +{ + int ret = 
__float_as_int(*address); + while(val < __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +// - atomicMax (double/float) +// see https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda +__device__ __forceinline__ double atomicMax(double *address, double val) +{ + unsigned long long ret = __double_as_longlong(*address); + while(val > __longlong_as_double(ret)) + { + unsigned long long old = ret; + if((ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val))) == old) + break; + } + return __longlong_as_double(ret); +} + +__device__ __forceinline__ float atomicMax(float *address, float val) +{ + int ret = __float_as_int(*address); + while(val > __int_as_float(ret)) + { + int old = ret; + if((ret = atomicCAS((int *)address, old, __float_as_int(val))) == old) + break; + } + return __int_as_float(ret); +} + +#endif \ No newline at end of file diff --git a/src/pystencils/include/simd_horizontal_helpers.h b/src/pystencils/include/simd_horizontal_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..bd1889153a56876032781b77dcb66a413b3fe07d --- /dev/null +++ b/src/pystencils/include/simd_horizontal_helpers.h @@ -0,0 +1,231 @@ +#pragma once + +#include <cmath> + +#if defined(__SSE3__) +#include <immintrin.h> + +inline double _mm_horizontal_add_pd(double dst, __m128d src) { + __m128d _v = src; + return dst + _mm_cvtsd_f64(_mm_hadd_pd(_v, _v)); +} + +inline float _mm_horizontal_add_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_hadd_ps(_v, _v); + return dst + _mm_cvtss_f32(_mm_add_ps(_h, _mm_movehdup_ps(_h))); +} + +inline double _mm_horizontal_mul_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_mul_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return dst * _r; +} + +inline float _mm_horizontal_mul_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_mul_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return dst * _r; +} + +inline double _mm_horizontal_min_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_min_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmin(_r, dst); +} + +inline float _mm_horizontal_min_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_min_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmin(_r, dst); +} + +inline double _mm_horizontal_max_pd(double dst, __m128d src) { + __m128d _v = src; + double _r = _mm_cvtsd_f64(_mm_max_pd(_v, _mm_shuffle_pd(_v, _v, 1))); + return fmax(_r, dst); +} + +inline float _mm_horizontal_max_ps(float dst, __m128 src) { + __m128 _v = src; + __m128 _h = _mm_max_ps(_v, _mm_shuffle_ps(_v, _v, 177)); + float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10))); + return fmax(_r, dst); +} + +#endif + +#if defined(__AVX__) +#include <immintrin.h> + +inline double _mm256_horizontal_add_pd(double dst, __m256d src) { + __m256d _v = src; + __m256d _h = _mm256_hadd_pd(_v, _v); + return dst + _mm_cvtsd_f64(_mm_add_pd(_mm256_extractf128_pd(_h,1), _mm256_castpd256_pd128(_h))); +} + +inline float _mm256_horizontal_add_ps(float dst, __m256 src) { + __m256 _v = src; + __m256 _h = _mm256_hadd_ps(_v, _v); + __m128 _i = _mm_add_ps(_mm256_extractf128_ps(_h,1), _mm256_castps256_ps128(_h)); + return dst + 
_mm_cvtss_f32(_mm_hadd_ps(_i,_i));
+}
+
+inline double _mm256_horizontal_mul_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_mul_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_mul_pd(_w, _mm_permute_pd(_w,1)));
+    return dst * _r;
+}
+
+inline float _mm256_horizontal_mul_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_mul_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_mul_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_mul_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return dst * _r;
+}
+
+inline double _mm256_horizontal_min_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_min_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_min_pd(_w, _mm_permute_pd(_w,1)));
+    return fmin(_r, dst);
+}
+
+inline float _mm256_horizontal_min_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_min_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_min_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_min_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return fmin(_r, dst);
+}
+
+inline double _mm256_horizontal_max_pd(double dst, __m256d src) {
+    __m256d _v = src;
+    __m128d _w = _mm_max_pd(_mm256_extractf128_pd(_v,1), _mm256_castpd256_pd128(_v));
+    double _r = _mm_cvtsd_f64(_mm_max_pd(_w, _mm_permute_pd(_w,1)));
+    return fmax(_r, dst);
+}
+
+inline float _mm256_horizontal_max_ps(float dst, __m256 src) {
+    __m256 _v = src;
+    __m128 _w = _mm_max_ps(_mm256_extractf128_ps(_v,1), _mm256_castps256_ps128(_v));
+    __m128 _h = _mm_max_ps(_w, _mm_shuffle_ps(_w, _w, 177));
+    float _r = _mm_cvtss_f32(_mm_max_ps(_h, _mm_shuffle_ps(_h, _h, 10)));
+    return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(__AVX512F__)
+#include <immintrin.h>
+
+inline double _mm512_horizontal_add_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_add_pd(src);
+    return dst + _r;
+}
+
+inline float _mm512_horizontal_add_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_add_ps(src);
+    return dst + _r;
+}
+
+inline double _mm512_horizontal_mul_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_mul_pd(src);
+    return dst * _r;
+}
+
+inline float _mm512_horizontal_mul_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_mul_ps(src);
+    return dst * _r;
+}
+
+inline double _mm512_horizontal_min_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_min_pd(src);
+    return fmin(_r, dst);
+}
+
+inline float _mm512_horizontal_min_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_min_ps(src);
+    return fmin(_r, dst);
+}
+
+inline double _mm512_horizontal_max_pd(double dst, __m512d src) {
+    double _r = _mm512_reduce_max_pd(src);
+    return fmax(_r, dst);
+}
+
+inline float _mm512_horizontal_max_ps(float dst, __m512 src) {
+    float _r = _mm512_reduce_max_ps(src);
+    return fmax(_r, dst);
+}
+
+#endif
+
+#if defined(_M_ARM64)
+#include <arm_neon.h>
+
+inline double vgetq_horizontal_add_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r += vgetq_lane_f64(_v,1);
+    return dst + _r;
+}
+
+inline float vget_horizontal_add_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vadd_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r += vget_lane_f32(_w,1);
+    return dst + _r;
+}
+
+inline double vgetq_horizontal_mul_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r *= vgetq_lane_f64(_v,1);
+    return dst * _r;
+}
+
+inline float vget_horizontal_mul_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmul_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r *= vget_lane_f32(_w,1);
+    return dst * _r;
+}
+
+inline double vgetq_horizontal_min_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r = fmin(_r, vgetq_lane_f64(_v,1));
+    return fmin(_r, dst);
+}
+
+inline float vget_horizontal_min_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmin_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r = fmin(_r, vget_lane_f32(_w,1));
+    return fmin(_r, dst);
+}
+
+inline double vgetq_horizontal_max_f64(double dst, float64x2_t src) {
+    float64x2_t _v = src;
+    double _r = vgetq_lane_f64(_v,0);
+    _r = fmax(_r, vgetq_lane_f64(_v,1));
+    return fmax(_r, dst);
+}
+
+inline float vget_horizontal_max_f32(float dst, float32x4_t src) {
+    float32x4_t _v = src;
+    float32x2_t _w = vmax_f32(vget_high_f32(_v), vget_low_f32(_v));
+    float _r = vget_lane_f32(_w,0);
+    _r = fmax(_r, vget_lane_f32(_w,1));
+    return fmax(_r, dst);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/pystencils/jit/cpu_extension_module.py b/src/pystencils/jit/cpu_extension_module.py
index fca043db90725a441cb8b0ed9b99765c53eecb8d..4d76ea9ca1d75bc2b2ca2d77976b42c5f16b240c 100644
--- a/src/pystencils/jit/cpu_extension_module.py
+++ b/src/pystencils/jit/cpu_extension_module.py
@@ -92,6 +92,7 @@ class PsKernelExtensioNModule:

         # Kernels and call wrappers
         from ..backend.emission import CAstPrinter
+
         printer = CAstPrinter(func_prefix="FUNC_PREFIX")

         for name, kernel in self._kernels.items():
@@ -200,13 +201,15 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
 """

     def __init__(self) -> None:
-        self._buffer_types: dict[Field, PsType] = dict()
-        self._array_extractions: dict[Field, str] = dict()
-        self._array_frees: dict[Field, str] = dict()
+        self._buffer_types: dict[Any, PsType] = dict()
+        self._array_extractions: dict[Any, str] = dict()
+        self._array_frees: dict[Any, str] = dict()

         self._array_assoc_var_extractions: dict[Parameter, str] = dict()
         self._scalar_extractions: dict[Parameter, str] = dict()

+        self._pointer_extractions: dict[Parameter, str] = dict()
+
         self._constraint_checks: list[str] = []

         self._call: str | None = None
@@ -232,38 +235,42 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{
         else:
             return None

+    def get_buffer(self, buffer_name: str) -> str:
+        """Get the Python buffer object for a given buffer name."""
+        return f"buffer_{buffer_name}"
+
     def get_field_buffer(self, field: Field) -> str:
         """Get the Python buffer object for the given field."""
-        return f"buffer_{field.name}"
+        return self.get_buffer(field.name)

-    def extract_field(self, field: Field) -> None:
-        """Add the necessary code to extract the NumPy array for a given field"""
-        if field not in self._array_extractions:
-            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=field.name)
-            actual_dtype = self._buffer_types[field]
+    def extract_buffer(self, buffer: Any, name: str) -> None:
+        """Add the necessary code to extract the NumPy array for a given buffer"""
+        if buffer not in self._array_extractions:
+            extraction_code = self.TMPL_EXTRACT_ARRAY.format(name=name)
+            actual_dtype = self._buffer_types[buffer]

             # Check array type
             type_char = self._type_char(actual_dtype)
             if type_char is not None:
-                dtype_cond = f"buffer_{field.name}.format[0] == '{type_char}'"
+                dtype_cond = f"buffer_{name}.format[0] == '{type_char}'"
f"buffer_{name}.format[0] == '{type_char}'" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( cond=dtype_cond, what="data type", - name=field.name, + name=name, expected=str(actual_dtype), ) # Check item size itemsize = actual_dtype.itemsize - item_size_cond = f"buffer_{field.name}.itemsize == {itemsize}" + item_size_cond = f"buffer_{name}.itemsize == {itemsize}" extraction_code += self.TMPL_CHECK_ARRAY_TYPE.format( - cond=item_size_cond, what="itemsize", name=field.name, expected=itemsize + cond=item_size_cond, what="itemsize", name=name, expected=itemsize ) - self._array_extractions[field] = extraction_code + self._array_extractions[buffer] = extraction_code - release_code = f"PyBuffer_Release(&buffer_{field.name});" - self._array_frees[field] = release_code + release_code = f"PyBuffer_Release(&buffer_{name});" + self._array_frees[buffer] = release_code def extract_scalar(self, param: Parameter) -> str: if param not in self._scalar_extractions: @@ -277,6 +284,26 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ return param.name + def extract_ptr(self, param: Parameter) -> str: + if param not in self._pointer_extractions: + ptr = param.symbol + ptr_dtype = ptr.dtype + + assert isinstance(ptr_dtype, PsPointerType) + + self._buffer_types[ptr] = ptr_dtype.base_type + self.extract_buffer(ptr, param.name) + buffer = self.get_buffer(param.name) + code = ( + f"{param.dtype.c_string()} {param.name} = ({param.dtype}) {buffer}.buf;" + ) + + assert code is not None + + self._array_assoc_var_extractions[param] = code + + return param.name + def extract_array_assoc_var(self, param: Parameter) -> str: if param not in self._array_assoc_var_extractions: field = param.fields[0] @@ -321,11 +348,13 @@ if( !kwargs || !PyDict_Check(kwargs) ) {{ actual_field_type = field.dtype self._buffer_types[prop.field] = actual_field_type - self.extract_field(prop.field) + self.extract_buffer(prop.field, field.name) for param in params: if param.is_field_parameter: self.extract_array_assoc_var(param) + elif isinstance(param.dtype, PsPointerType): + self.extract_ptr(param) else: self.extract_scalar(param) diff --git a/src/pystencils/jit/gpu_cupy.py b/src/pystencils/jit/gpu_cupy.py index 2217809618ec9a161d5e2d6f8c3c374e399e0471..760dccf41883c26b21c0fd8fde5820832bead3d1 100644 --- a/src/pystencils/jit/gpu_cupy.py +++ b/src/pystencils/jit/gpu_cupy.py @@ -11,7 +11,6 @@ except ImportError: from ..codegen import Target from ..field import FieldType -from ..types import PsType from .jit import JitBase, JitError, KernelWrapper from ..codegen import ( Kernel, @@ -20,7 +19,7 @@ from ..codegen import ( ) from ..codegen.gpu_indexing import GpuLaunchConfiguration from ..codegen.properties import FieldShape, FieldStride, FieldBasePtr -from ..types import PsStructType, PsPointerType +from ..types import PsType, PsStructType, PsPointerType from ..include import get_pystencils_include_path @@ -186,10 +185,13 @@ class CupyKernelWrapper(KernelWrapper): arr.strides[coord] // arr.dtype.itemsize, ) break + elif isinstance(kparam.dtype, PsPointerType): + val = kwargs[kparam.name] + kernel_args.append(val) else: # scalar parameter - val: Any = kwargs[param.name] - adder(param, val) + val = kwargs[kparam.name] + adder(kparam, val) # Process Arguments diff --git a/src/pystencils/reduction_op_mapping.py b/src/pystencils/reduction_op_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..06fb8aa3e981d661f8e4226a34307f03cd6d9a70 --- /dev/null +++ b/src/pystencils/reduction_op_mapping.py @@ -0,0 +1,39 @@ +from 
+from .backend.exceptions import FreezeError
+from .backend.functions import PsMathFunction, MathFunctions
+from .sympyextensions.reduction import ReductionOp
+
+_available_operator_interface: set[ReductionOp] = {
+    ReductionOp.Add,
+    ReductionOp.Sub,
+    ReductionOp.Mul,
+    ReductionOp.Div,
+}
+
+
+def reduction_op_to_expr(op: ReductionOp, op1, op2) -> PsExpression:
+    if op in _available_operator_interface:
+        match op:
+            case ReductionOp.Add:
+                operator = PsAdd
+            case ReductionOp.Sub:
+                operator = PsSub
+            case ReductionOp.Mul:
+                operator = PsMul
+            case ReductionOp.Div:
+                operator = PsDiv
+            case _:
+                raise FreezeError(
+                    f"Found unsupported operation type for reduction assignments: {op}."
+                )
+        return operator(op1, op2)
+    else:
+        match op:
+            case ReductionOp.Min:
+                return PsCall(PsMathFunction(MathFunctions.Min), [op1, op2])
+            case ReductionOp.Max:
+                return PsCall(PsMathFunction(MathFunctions.Max), [op1, op2])
+            case _:
+                raise FreezeError(
+                    f"Found unsupported operation type for reduction assignments: {op}."
+                )
diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py
index f1ba8715431d96fb2a09a01e45872def421fe94f..03b4edccfdd402d3013e0e82b924f5e5e87458c7 100644
--- a/src/pystencils/simp/assignment_collection.py
+++ b/src/pystencils/simp/assignment_collection.py
@@ -1,5 +1,6 @@
 import itertools
 from copy import copy
+
 from typing import Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Union

 import sympy as sp
diff --git a/src/pystencils/sympyextensions/__init__.py b/src/pystencils/sympyextensions/__init__.py
index fe108a3ff9ae12139e6204f6cd2cc174b4c5211a..bd0fa1fe9d5459f842ec4aca52bf13de63ab7834 100644
--- a/src/pystencils/sympyextensions/__init__.py
+++ b/src/pystencils/sympyextensions/__init__.py
@@ -1,6 +1,7 @@
 from .astnodes import ConditionalFieldAccess
 from .typed_sympy import TypedSymbol, CastFunc, tcast, DynamicType
 from .pointers import mem_acc
+from .reduction import reduction_assignment, reduction_assignment_from_str, ReductionOp

 from .math import (
     prod,
@@ -27,12 +28,15 @@ from .math import (
     count_operations_in_ast,
     common_denominator,
     get_symmetric_part,
-    SymbolCreator
+    SymbolCreator,
 )

 __all__ = [
     "ConditionalFieldAccess",
+    "reduction_assignment",
+    "reduction_assignment_from_str",
+    "ReductionOp",
     "TypedSymbol",
     "CastFunc",
     "tcast",
@@ -63,5 +67,5 @@ __all__ = [
     "common_denominator",
     "get_symmetric_part",
     "SymbolCreator",
-    "DynamicType"
+    "DynamicType",
 ]
diff --git a/src/pystencils/sympyextensions/reduction.py b/src/pystencils/sympyextensions/reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..e95e37c24f3e5815ebaebb8f2739da15619a70c5
--- /dev/null
+++ b/src/pystencils/sympyextensions/reduction.py
@@ -0,0 +1,82 @@
+from enum import Enum
+
+from sympy.codegen.ast import AssignmentBase
+
+
+class ReductionOp(Enum):
+    Add = "+"
+    Sub = "-"
+    Mul = "*"
+    Div = "/"
+    Min = "min"
+    Max = "max"
+
+
+class ReductionAssignment(AssignmentBase):
+    """
+    Base class for reduction assignments.
+
+    Attributes:
+    ===========
+
+    reduction_op : ReductionOp
+        Enum for the binary operation being applied in the assignment, such as "Add" for "+", "Sub" for "-", etc.
+ """ + + _reduction_op = None # type: ReductionOp + + @property + def reduction_op(self): + return self._reduction_op + + @reduction_op.setter + def reduction_op(self, op): + self._reduction_op = op + + +class AddReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Add + + +class SubReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Sub + + +class MulReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Mul + + +class MinReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Min + + +class MaxReductionAssignment(ReductionAssignment): + reduction_op = ReductionOp.Max + + +# Mapping from ReductionOp enum to ReductionAssigment classes +_reduction_assignment_classes = { + cls.reduction_op: cls + for cls in [ + AddReductionAssignment, + SubReductionAssignment, + MulReductionAssignment, + MinReductionAssignment, + MaxReductionAssignment, + ] +} + +# Mapping from ReductionOp str to ReductionAssigment classes +_reduction_assignment_classes_for_str = { + cls.value: cls for cls in _reduction_assignment_classes +} + + +def reduction_assignment(lhs, op: ReductionOp, rhs): + if op not in _reduction_assignment_classes: + raise ValueError("Unrecognized operator %s" % op) + return _reduction_assignment_classes[op](lhs, rhs) + + +def reduction_assignment_from_str(lhs, op: str, rhs): + return reduction_assignment(lhs, _reduction_assignment_classes_for_str[op], rhs) diff --git a/tests/kernelcreation/test_reduction.py b/tests/kernelcreation/test_reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..cd1710cf5ffe8e5ced135867be2c9e947c7e6b64 --- /dev/null +++ b/tests/kernelcreation/test_reduction.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np + +import pystencils as ps +from pystencils.sympyextensions import reduction_assignment_from_str + +INIT_W = 5 +INIT_ARR = 2 +SIZE = 15 +SOLUTION = { + "+": INIT_W + INIT_ARR * SIZE, + "-": INIT_W - INIT_ARR * SIZE, + "*": INIT_W * INIT_ARR**SIZE, + "min": min(INIT_W, INIT_ARR), + "max": max(INIT_W, INIT_ARR), +} + + +# get AST for kernel with reduction assignment +def get_reduction_assign_ast(dtype, op, config): + x = ps.fields(f"x: {dtype}[1d]") + w = ps.TypedSymbol("w", dtype) + + red_assign = reduction_assignment_from_str(w, op, x.center()) + + return ps.create_kernel([red_assign], config, default_dtype=dtype) + + +@pytest.mark.parametrize("instruction_set", ["sse", "avx"]) +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +def test_reduction_cpu(instruction_set, dtype, op): + vectorize_info = { + "instruction_set": instruction_set, + "assume_inner_stride_one": True, + } + + config = ps.CreateKernelConfig( + target=ps.Target.CPU, cpu_openmp=True, cpu_vectorize_info=vectorize_info + ) + + ast_reduction = get_reduction_assign_ast(dtype, op, config) + ps.show_code(ast_reduction) + kernel_reduction = ast_reduction.compile() + + array = np.full((SIZE,), INIT_ARR, dtype=dtype) + reduction_array = np.full((1,), INIT_W, dtype=dtype) + + kernel_reduction(x=array, w=reduction_array) + assert np.allclose(reduction_array, SOLUTION[op]) + + +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("op", ["+", "-", "*", "min", "max"]) +@pytest.mark.parametrize("assume_warp_aligned_block_size", [True, False]) +@pytest.mark.parametrize("use_block_fitting", [True, False]) +def test_reduction_gpu( + dtype: str, + op: str, + assume_warp_aligned_block_size: bool, + use_block_fitting: 
+):
+    try:
+        import cupy as cp
+        from cupy_backends.cuda.api.runtime import CUDARuntimeError
+
+        device_count = cp.cuda.runtime.getDeviceCount()
+        print(f"Found {device_count} GPUs")
+    except ImportError:
+        pytest.skip(reason="CuPy is not available", allow_module_level=True)
+    except CUDARuntimeError:
+        pytest.skip(
+            reason="No CUDA capable device is detected", allow_module_level=True
+        )
+
+    cfg = ps.CreateKernelConfig(target=ps.Target.GPU)
+    cfg.gpu.assume_warp_aligned_block_size = assume_warp_aligned_block_size
+
+    ast_reduction = get_reduction_assign_ast(dtype, op, cfg)
+    ps.show_code(ast_reduction)
+    kernel_reduction = ast_reduction.compile()
+
+    if use_block_fitting:
+        kernel_reduction.launch_config.fit_block_size((32, 1, 1))
+
+    array = np.full((SIZE,), INIT_ARR, dtype=dtype)
+    reduction_array = np.full((1,), INIT_W, dtype=dtype)
+
+    array_gpu = cp.asarray(array)
+    reduction_array_gpu = cp.asarray(reduction_array)
+
+    kernel_reduction(x=array_gpu, w=reduction_array_gpu)
+    assert np.allclose(reduction_array_gpu.get(), SOLUTION[op])
diff --git a/util/generate_simd_horizontal_op.py b/util/generate_simd_horizontal_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d652c6e1f53f5a7b060f2a725162ef07f1fdfae
--- /dev/null
+++ b/util/generate_simd_horizontal_op.py
@@ -0,0 +1,309 @@
+from enum import Enum
+
+FCT_QUALIFIERS = "inline"
+
+
+class InstructionSets(Enum):
+    SSE3 = "SSE3"
+    AVX = "AVX"
+    AVX512 = "AVX512"
+    NEON = "NEON"
+
+    def __str__(self):
+        return self.value
+
+
+class ReductionOps(Enum):
+    Add = ("add", "+")
+    Mul = ("mul", "*")
+    Min = ("min", "min")
+    Max = ("max", "max")
+
+    def __init__(self, op_name, op_str):
+        self.op_name = op_name
+        self.op_str = op_str
+
+
+class ScalarTypes(Enum):
+    Double = "double"
+    Float = "float"
+
+    def __str__(self):
+        return self.value
+
+
+class VectorTypes(Enum):
+    SSE3_128d = "__m128d"
+    SSE3_128 = "__m128"
+
+    AVX_256d = "__m256d"
+    AVX_256 = "__m256"
+    AVX_128 = "__m128"
+
+    AVX_512d = "__m512d"
+    AVX_512 = "__m512"
+
+    NEON_64x2 = "float64x2_t"
+    NEON_32x4 = "float32x4_t"
+
+    def __str__(self):
+        return self.value
+
+
+class Variable:
+    def __init__(self, name: str, dtype: ScalarTypes | VectorTypes):
+        self._name = name
+        self._dtype = dtype
+
+    def __str__(self):
+        return f"{self._dtype} {self._name}"
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def dtype(self) -> ScalarTypes | VectorTypes:
+        return self._dtype
+
+
+def get_intrin_from_vector_type(vtype: VectorTypes) -> InstructionSets:
+    match vtype:
+        case VectorTypes.SSE3_128 | VectorTypes.SSE3_128d:
+            return InstructionSets.SSE3
+        case VectorTypes.AVX_256 | VectorTypes.AVX_256d:
+            return InstructionSets.AVX
+        case VectorTypes.AVX_512 | VectorTypes.AVX_512d:
+            return InstructionSets.AVX512
+        case VectorTypes.NEON_32x4 | VectorTypes.NEON_64x2:
+            return InstructionSets.NEON
+
+
+def intrin_prefix(instruction_set: InstructionSets, double_prec: bool):
+    match instruction_set:
+        case InstructionSets.SSE3:
+            return "_mm"
+        case InstructionSets.AVX:
+            return "_mm256"
+        case InstructionSets.AVX512:
+            return "_mm512"
+        case InstructionSets.NEON:
+            return "vgetq" if double_prec else "vget"
+        case _:
+            raise ValueError(f"Unknown instruction set {instruction_set}")
+
+
+def intrin_suffix(instruction_set: InstructionSets, double_prec: bool):
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        return "pd" if double_prec else "ps"
+    elif instruction_set in [InstructionSets.NEON]:
return "f64" if double_prec else "f32" + else: + raise ValueError(f"Unknown instruction set {instruction_set}") + + +def generate_hadd_intrin(instruction_set: InstructionSets, double_prec: bool, v: str): + return f"{intrin_prefix(instruction_set, double_prec)}_hadd_{intrin_suffix(instruction_set, double_prec)}({v}, {v})" + + +def generate_shuffle_intrin(instruction_set: InstructionSets, double_prec: bool, v: str, offset): + return f"_mm_shuffle_{intrin_suffix(instruction_set, double_prec)}({v}, {v}, {offset})" + + +def generate_op_intrin(instruction_set: InstructionSets, double_prec: bool, reduction_op: ReductionOps, a: str, b: str): + return f"_mm_{reduction_op.op_name}_{intrin_suffix(instruction_set, double_prec)}({a}, {b})" + + +def generate_cvts_intrin(double_prec: bool, v: str): + convert_suffix = "f64" if double_prec else "f32" + intrin_suffix = "d" if double_prec else "s" + return f"_mm_cvts{intrin_suffix}_{convert_suffix}({v})" + + +def generate_fct_name(instruction_set: InstructionSets, double_prec: bool, op: ReductionOps): + prefix = intrin_prefix(instruction_set, double_prec) + suffix = intrin_suffix(instruction_set, double_prec) + return f"{prefix}_horizontal_{op.op_name}_{suffix}" + + +def generate_fct_decl(instruction_set: InstructionSets, op: ReductionOps, svar: Variable, vvar: Variable): + double_prec = svar.dtype is ScalarTypes.Double + return f"{FCT_QUALIFIERS} {svar.dtype} {generate_fct_name(instruction_set, double_prec, op)}({svar}, {vvar}) {{ \n" + + +# SSE & AVX provide horizontal add 'hadd' intrinsic that allows for specialized handling +def generate_simd_horizontal_add(scalar_var: Variable, vector_var: Variable): + reduction_op = ReductionOps.Add + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + sname = scalar_var.name + vtype = vector_var.dtype + vname = vector_var.name + + simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b) + hadd = lambda var: generate_hadd_intrin(instruction_set, double_prec, var) + cvts = lambda var: generate_cvts_intrin(double_prec, var) + + # function body + body = f"\t{vtype} _v = {vname};\n" + match instruction_set: + case InstructionSets.SSE3: + if double_prec: + body += f"\treturn {sname} + {cvts(hadd('_v'))};\n" + else: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_h', '_mm_movehdup_ps(_h)'))};\n" + + case InstructionSets.AVX: + if double_prec: + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\treturn {sname} + {cvts(simd_op('_mm256_extractf128_pd(_h,1)', '_mm256_castpd256_pd128(_h)'))};\n" + else: + add_i = "_mm_hadd_ps(_i,_i)" + body += f"\t{vtype} _h = {hadd('_v')};\n" \ + f"\t__m128 _i = {simd_op('_mm256_extractf128_ps(_h,1)', '_mm256_castps256_ps128(_h)')};\n" \ + f"\treturn {sname} + {cvts(add_i)};\n" + + case _: + raise ValueError(f"No specialized version of horizontal_add available for {instruction_set}") + + # function decl + decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var) + + return decl + body + "}\n" + + +def generate_simd_horizontal_op(reduction_op: ReductionOps, scalar_var: Variable, vector_var: Variable): + instruction_set = get_intrin_from_vector_type(vector_var.dtype) + double_prec = scalar_var.dtype is ScalarTypes.Double + + # generate specialized version for add operation + if reduction_op == ReductionOps.Add and instruction_set in [InstructionSets.SSE3, InstructionSets.AVX]: + return generate_simd_horizontal_add(scalar_var, 
+
+    sname = scalar_var.name
+    stype = scalar_var.dtype
+    vtype = vector_var.dtype
+    vname = vector_var.name
+
+    opname = reduction_op.op_name
+    opstr = reduction_op.op_str
+
+    reduction_function = f"f{opname}" \
+        if reduction_op in [ReductionOps.Max, ReductionOps.Min] else None
+
+    simd_op = lambda a, b: generate_op_intrin(instruction_set, double_prec, reduction_op, a, b)
+    cvts = lambda var: generate_cvts_intrin(double_prec, var)
+    shuffle = lambda var, offset: generate_shuffle_intrin(instruction_set, double_prec, var, offset)
+
+    # function body
+    body = f"\t{vtype} _v = {vname};\n" if instruction_set != InstructionSets.AVX512 else ""
+    match instruction_set:
+        case InstructionSets.SSE3:
+            if double_prec:
+                body += f"\t{stype} _r = {cvts(simd_op('_v', shuffle('_v', 1)))};\n"
+            else:
+                body += f"\t{vtype} _h = {simd_op('_v', shuffle('_v', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX:
+            if double_prec:
+                body += f"\t__m128d _w = {simd_op('_mm256_extractf128_pd(_v,1)', '_mm256_castpd256_pd128(_v)')};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_w', '_mm_permute_pd(_w,1)'))};\n"
+            else:
+                body += f"\t__m128 _w = {simd_op('_mm256_extractf128_ps(_v,1)', '_mm256_castps256_ps128(_v)')};\n" \
+                        f"\t__m128 _h = {simd_op('_w', shuffle('_w', 177))};\n" \
+                        f"\t{stype} _r = {cvts(simd_op('_h', shuffle('_h', 10)))};\n"
+
+        case InstructionSets.AVX512:
+            suffix = intrin_suffix(instruction_set, double_prec)
+            body += f"\t{stype} _r = _mm512_reduce_{opname}_{suffix}({vname});\n"
+
+        case InstructionSets.NEON:
+            if double_prec:
+                body += f"\t{stype} _r = vgetq_lane_f64(_v,0);\n"
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vgetq_lane_f64(_v,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vgetq_lane_f64(_v,1);\n"
+            else:
+                body += f"\tfloat32x2_t _w = v{opname}_f32(vget_high_f32(_v), vget_low_f32(_v));\n" \
+                        f"\t{stype} _r = vget_lane_f32(_w,0);\n"
+                if reduction_function:
+                    body += f"\t_r = {reduction_function}(_r, vget_lane_f32(_w,1));\n"
+                else:
+                    body += f"\t_r {opstr}= vget_lane_f32(_w,1);\n"
+
+        case _:
+            raise ValueError(f"Unsupported instruction set {instruction_set}")
+
+    # finalize reduction
+    if reduction_function:
+        body += f"\treturn {reduction_function}(_r, {sname});\n"
+    else:
+        body += f"\treturn {sname} {opstr} _r;\n"
+
+    # function decl
+    decl = generate_fct_decl(instruction_set, reduction_op, scalar_var, vector_var)
+
+    return decl + body + "}\n"
+
+
+stypes = {
+    True: ScalarTypes.Double,
+    False: ScalarTypes.Float
+}
+
+vtypes_for_instruction_set = {
+    InstructionSets.SSE3: {
+        True: VectorTypes.SSE3_128d,
+        False: VectorTypes.SSE3_128
+    },
+    InstructionSets.AVX: {
+        True: VectorTypes.AVX_256d,
+        False: VectorTypes.AVX_256
+    },
+    InstructionSets.AVX512: {
+        True: VectorTypes.AVX_512d,
+        False: VectorTypes.AVX_512
+    },
+    InstructionSets.NEON: {
+        True: VectorTypes.NEON_64x2,
+        False: VectorTypes.NEON_32x4
+    },
+}
+
+guards_for_instruction_sets = {
+    InstructionSets.SSE3: "__SSE3__",
+    InstructionSets.AVX: "__AVX__",
+    InstructionSets.AVX512: '__AVX512F__',
+    InstructionSets.NEON: '_M_ARM64',
+}
+
+code = """#pragma once
+
+#include <cmath>
+
+"""
+
+for instruction_set in InstructionSets:
+    code += f"#if defined({guards_for_instruction_sets[instruction_set]})\n"
+
+    if instruction_set in [InstructionSets.SSE3, InstructionSets.AVX, InstructionSets.AVX512]:
+        code += "#include <immintrin.h>\n\n"
+    elif instruction_set == InstructionSets.NEON:
+        code += "#include <arm_neon.h>\n\n"
+    else:
+        raise ValueError(f"Missing header include for instruction set {instruction_set}")
ValueError(f"Missing header include for instruction set {instruction_set}") + + for reduction_op in ReductionOps: + for double_prec in [True, False]: + scalar_var = Variable("dst", stypes[double_prec]) + vector_var = Variable("src", vtypes_for_instruction_set[instruction_set][double_prec]) + + code += generate_simd_horizontal_op(reduction_op, scalar_var, vector_var) + "\n" + + code += "#endif\n\n" + +print(code)