From 103553f4b26ad2f62b07d9781212d2562ec82739 Mon Sep 17 00:00:00 2001
From: Frederik Hennig <frederik.hennig@fau.de>
Date: Mon, 11 Mar 2024 15:02:36 +0100
Subject: [PATCH] add CPU optimizer config. Extend various doc comments.

---
 src/pystencils/backend/ast/astnode.py         |   7 +-
 .../backend/kernelcreation/context.py         |  31 +++---
 .../backend/kernelcreation/freeze.py          |   1 +
 src/pystencils/backend/kernelfunction.py      |   8 +-
 src/pystencils/backend/symbols.py             |   4 +-
 src/pystencils/config.py                      | 102 ++++++++++++++++++
 src/pystencils/kernelcreation.py              |   3 +-
 7 files changed, 132 insertions(+), 24 deletions(-)

diff --git a/src/pystencils/backend/ast/astnode.py b/src/pystencils/backend/ast/astnode.py
index 3487d4200..4ef557fe1 100644
--- a/src/pystencils/backend/ast/astnode.py
+++ b/src/pystencils/backend/ast/astnode.py
@@ -31,10 +31,15 @@ class PsAstNode(ABC):
 
     @abstractmethod
     def clone(self) -> PsAstNode:
+        """Perform a deep copy of the AST."""
         pass
 
     def structurally_equal(self, other: PsAstNode) -> bool:
-        """Check two ASTs for structural equality."""
+        """Check two ASTs for structural equality.
+
+        By default this method checks the node's type and children.
+        If an AST node has additional internal state, it MUST override this method.
+        """
         return (
             (type(self) is type(other))
             and len(self.children) == len(other.children)
diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py
index 22286156e..7bf75f69b 100644
--- a/src/pystencils/backend/kernelcreation/context.py
+++ b/src/pystencils/backend/kernelcreation/context.py
@@ -41,25 +41,20 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array"))
 
 class KernelCreationContext:
     """Manages the translation process from the SymPy frontend to the backend AST, and collects
-    all necessary information for the translation.
+    all necessary information for the translation:
+
+     - *Data Types*: The kernel creation context manages the default data types for loop limits
+       and counters, index calculations, and the typifier.
+     - *Symbols*: The context maintains a symbol table, keeping track of all symbols encountered
+       during kernel translation together with their types.
+     - *Fields and Arrays*: The context collects all fields encountered during code generation,
+       applies a few consistency checks to them, and manages their associated arrays.
+     - *Iteration Space*: The context manages the iteration space of the kernel currently being
+       translated.
+     - *Constraints*: The context collects all kernel parameter constraints introduced during the
+       translation process.
+     - *Required Headers*: The context collects all header files required for the kernel to run.
 
-
-    Data Types
-    ----------
-
-    The kernel creation context manages the default data types for loop limits and counters, index calculations,
-    and the typifier.
-
-    Fields and Arrays
-    ------------------
-
-    The kernel creation context acts as a factory for mapping fields to arrays.
-
-    Iteration Space
-    ---------------
-
-    The context manages the iteration space within which the current translation takes place. It may be a sparse
-    or full iteration space.
     """
 
     def __init__(
diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py
index 26dcc9d8e..ecdcf2f94 100644
--- a/src/pystencils/backend/kernelcreation/freeze.py
+++ b/src/pystencils/backend/kernelcreation/freeze.py
@@ -54,6 +54,7 @@ class FreezeExpressions:
      - Augmented Assignments
      - AddressOf
      - Conditionals (+ frontend class)
+     - Relations (sp.Relational)
      - pystencils.integer_functions
      - pystencils.sympyextensions.bit_masks
      - GPU fast approximations (pystencils.fast_approximation)
diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py
index c38ac60b4..45ec32603 100644
--- a/src/pystencils/backend/kernelfunction.py
+++ b/src/pystencils/backend/kernelfunction.py
@@ -132,17 +132,21 @@ class KernelFunction:
 
     @property
     def target(self) -> Target:
-        """See pystencils.Target"""
         return self._target
 
     @property
     def name(self) -> str:
         return self._name
-
+    
     @name.setter
     def name(self, n: str):
         self._name = n
 
+    @property
+    def function_name(self) -> str:
+        """Alias of `name`, kept for backward compatibility."""
+        return self._name
+
     @property
     def parameters(self) -> tuple[KernelParameter, ...]:
         return self._params
diff --git a/src/pystencils/backend/symbols.py b/src/pystencils/backend/symbols.py
index 3c3d5ab6e..e1db5a930 100644
--- a/src/pystencils/backend/symbols.py
+++ b/src/pystencils/backend/symbols.py
@@ -5,7 +5,7 @@ from .exceptions import PsInternalCompilerError
 class PsSymbol:
     """A mutable symbol with name and data type.
 
-    Be advised to not create objects of this class directly unless you know what you are doing;
+    Do not create objects of this class directly unless you know what you are doing;
     instead obtain them from a `KernelCreationContext` through `KernelCreationContext.get_symbol`.
     This way, the context can keep track of all symbols used in the translation run,
     and uniqueness of symbols is ensured.
@@ -50,4 +50,4 @@ class PsSymbol:
         return f"{self._name}: {dtype_str}"
 
     def __repr__(self) -> str:
-        return str(self)
+        return f"PsSymbol({self._name}, {self._dtype})"
diff --git a/src/pystencils/config.py b/src/pystencils/config.py
index d63bd9336..cb82785df 100644
--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -1,3 +1,7 @@
+from __future__ import annotations
+
+from collections.abc import Collection
+
 from typing import Sequence
 from dataclasses import dataclass
 
@@ -11,6 +15,90 @@ from .types import PsIntegerType, PsNumericType, PsIeeeFloatType
 from .defaults import DEFAULTS
 
 
+@dataclass
+class CpuOptimConfig:
+    """Configuration for the CPU optimizer.
+    
+    If any flag in this configuration is set to a value not supported by the CPU specified
+    in `CreateKernelConfig.target`, an error will be raised.
+    """
+    
+    openmp: bool = False
+    """Enable OpenMP parallelization.
+    
+    If set to `True`, the kernel will be parallelized using OpenMP according to the OpenMP settings
+    given in this configuration.
+    """
+
+    vectorize: bool | VectorizationConfig = False
+    """Enable and configure auto-vectorization.
+    
+    If set to an instance of `VectorizationConfig` and a CPU target with vector capabilities is selected,
+    pystencils will attempt to vectorize the kernel according to the given vectorization options.
+
+    If set to `True`, pystencils will infer vectorization options from the given CPU target.
+
+    If set to `False`, no vectorization takes place.
+    """
+
+    loop_blocking: None | tuple[int, ...] = None
+    """Block sizes for loop blocking.
+    
+    If set, the kernel's loops will be tiled according to the given block sizes.
+    """
+
+    use_cacheline_zeroing: bool = False
+    """Enable cache-line zeroing.
+    
+    If set to `True` and the selected CPU supports cache-line zeroing, the CPU optimizer will attempt
+    to produce cache-line zeroing instructions where possible.
+    """
+
+
+@dataclass
+class VectorizationConfig:
+    """Configuration for the auto-vectorizer.
+    
+    If any flag in this configuration is set to a value not supported by the CPU specified
+    in `CreateKernelConfig.target`, an error will be raised.
+    """
+
+    vector_width: int | None = None
+    """Desired vector register width in bits.
+    
+    If set to an integer value, the vectorizer will use this as the desired vector register width.
+
+    If set to `None`, the vector register width will be automatically set to the widest possible.
+    
+    If the selected CPU does not support the given width, an error will be raised.
+    """
+
+    use_nontemporal_stores: bool | Collection[str | Field] = False
+    """Enable nontemporal (streaming) stores.
+    
+    If set to `True` and the selected CPU supports streaming stores, the vectorizer will generate
+    nontemporal store instructions for all stores.
+
+    If set to a collection of fields (or field names), streaming stores will only be generated for
+    the given fields.
+    """
+
+    assume_aligned: bool = False
+    """Assume field pointer alignment.
+    
+    If set to `True`, the vectorizer will assume that the address of the first inner entry
+    (after ghost layers) of each field is aligned at the necessary byte boundary.
+    """
+
+    assume_inner_stride_one: bool = False
+    """Assume stride associated with the innermost spatial coordinate of all fields is one.
+    
+    If set to `True`, the vectorizer will replace the stride of the innermost spatial coordinate
+    with unity, thus enabling vectorization. If any fields already have a fixed innermost stride
+    that is not equal to one, an error will be raised.
+    """
+
+
 @dataclass
 class CreateKernelConfig:
     """Options for create_kernel."""
@@ -67,6 +155,12 @@ class CreateKernelConfig:
     This data type will be applied to all untyped symbols.
     """
 
+    cpu_optim: None | CpuOptimConfig = None
+    """Configuration of the CPU kernel optimizer.
+    
+    If this parameter is set while `target` is a non-CPU target, an error will be raised.
+    """
+
     def __post_init__(self):
         #   Check iteration space argument consistency
         if (
@@ -88,6 +182,14 @@ class CreateKernelConfig:
             raise PsOptionsError(
                 "Only fields with `field_type == FieldType.INDEXED` can be specified as `index_field`"
             )
+        
+        #   Check CPU optimizer options against the selected target
+        if self.cpu_optim is not None:
+            if not self.target.is_cpu():
+                raise PsOptionsError(f"`cpu_optim` cannot be set for non-CPU target {self.target}")
+            
+            if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu():
+                raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}")
 
         #   Infer JIT
         if self.jit is None:
diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py
index 439eb7872..a79c68c59 100644
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -37,7 +37,8 @@ def create_kernel(
     assignments: AssignmentCollection | list[Assignment] | Assignment,
     config: CreateKernelConfig = CreateKernelConfig(),
 ):
-    """Create a kernel AST from an assignment collection."""
+    """Create a kernel function from an assignment collection."""
+
     ctx = KernelCreationContext(
         default_dtype=config.default_dtype, index_dtype=config.index_dtype
     )
-- 
GitLab