From 103553f4b26ad2f62b07d9781212d2562ec82739 Mon Sep 17 00:00:00 2001 From: Frederik Hennig <frederik.hennig@fau.de> Date: Mon, 11 Mar 2024 15:02:36 +0100 Subject: [PATCH] add CPU optimizer config. Extend various doc comments. --- src/pystencils/backend/ast/astnode.py | 7 +- .../backend/kernelcreation/context.py | 31 +++--- .../backend/kernelcreation/freeze.py | 1 + src/pystencils/backend/kernelfunction.py | 8 +- src/pystencils/backend/symbols.py | 4 +- src/pystencils/config.py | 102 ++++++++++++++++++ src/pystencils/kernelcreation.py | 3 +- 7 files changed, 132 insertions(+), 24 deletions(-) diff --git a/src/pystencils/backend/ast/astnode.py b/src/pystencils/backend/ast/astnode.py index 3487d4200..4ef557fe1 100644 --- a/src/pystencils/backend/ast/astnode.py +++ b/src/pystencils/backend/ast/astnode.py @@ -31,10 +31,15 @@ class PsAstNode(ABC): @abstractmethod def clone(self) -> PsAstNode: + """Perform a deep copy of the AST.""" pass def structurally_equal(self, other: PsAstNode) -> bool: - """Check two ASTs for structural equality.""" + """Check two ASTs for structural equality. + + By default this method checks the node's type and children. + If an AST node has additional internal state, it MUST override this method. + """ return ( (type(self) is type(other)) and len(self.children) == len(other.children) diff --git a/src/pystencils/backend/kernelcreation/context.py b/src/pystencils/backend/kernelcreation/context.py index 22286156e..7bf75f69b 100644 --- a/src/pystencils/backend/kernelcreation/context.py +++ b/src/pystencils/backend/kernelcreation/context.py @@ -41,25 +41,20 @@ FieldArrayPair = namedtuple("FieldArrayPair", ("field", "array")) class KernelCreationContext: """Manages the translation process from the SymPy frontend to the backend AST, and collects - all necessary information for the translation. 
+ all necessary information for the translation: + + - *Data Types*: The kernel creation context manages the default data types for loop limits + and counters, index calculations, and the typifier. + - *Symbols*: The context maintains a symbol table, keeping track of all symbols encountered + during kernel translation together with their types. + - *Fields and Arrays*: The context collects all fields encountered during code generation, + applies a few consistency checks to them, and manages their associated arrays. + - *Iteration Space*: The context manages the iteration space of the kernel currently being + translated. + - *Constraints*: The context collects all kernel parameter constraints introduced during the + translation process. + - *Required Headers*: The context collects all header files required for the kernel to run. - - Data Types - ---------- - - The kernel creation context manages the default data types for loop limits and counters, index calculations, - and the typifier. - - Fields and Arrays - ------------------ - - The kernel creation context acts as a factory for mapping fields to arrays. - - Iteration Space - --------------- - - The context manages the iteration space within which the current translation takes place. It may be a sparse - or full iteration space. 
""" def __init__( diff --git a/src/pystencils/backend/kernelcreation/freeze.py b/src/pystencils/backend/kernelcreation/freeze.py index 26dcc9d8e..ecdcf2f94 100644 --- a/src/pystencils/backend/kernelcreation/freeze.py +++ b/src/pystencils/backend/kernelcreation/freeze.py @@ -54,6 +54,7 @@ class FreezeExpressions: - Augmented Assignments - AddressOf - Conditionals (+ frontend class) + - Relations (sp.Relational) - pystencils.integer_functions - pystencils.sympyextensions.bit_masks - GPU fast approximations (pystencils.fast_approximation) diff --git a/src/pystencils/backend/kernelfunction.py b/src/pystencils/backend/kernelfunction.py index c38ac60b4..45ec32603 100644 --- a/src/pystencils/backend/kernelfunction.py +++ b/src/pystencils/backend/kernelfunction.py @@ -132,17 +132,21 @@ class KernelFunction: @property def target(self) -> Target: - """See pystencils.Target""" return self._target @property def name(self) -> str: return self._name - + @name.setter def name(self, n: str): self._name = n + @property + def function_name(self) -> str: + """For backward compatibility""" + return self._name + @property def parameters(self) -> tuple[KernelParameter, ...]: return self._params diff --git a/src/pystencils/backend/symbols.py b/src/pystencils/backend/symbols.py index 3c3d5ab6e..e1db5a930 100644 --- a/src/pystencils/backend/symbols.py +++ b/src/pystencils/backend/symbols.py @@ -5,7 +5,7 @@ from .exceptions import PsInternalCompilerError class PsSymbol: """A mutable symbol with name and data type. - Be advised to not create objects of this class directly unless you know what you are doing; + Do not create objects of this class directly unless you know what you are doing; instead obtain them from a `KernelCreationContext` through `KernelCreationContext.get_symbol`. This way, the context can keep track of all symbols used in the translation run, and uniqueness of symbols is ensured. 
@@ -50,4 +50,4 @@ class PsSymbol: return f"{self._name}: {dtype_str}" def __repr__(self) -> str: - return str(self) + return f"PsSymbol({self._name}, {self._dtype})" diff --git a/src/pystencils/config.py b/src/pystencils/config.py index d63bd9336..cb82785df 100644 --- a/src/pystencils/config.py +++ b/src/pystencils/config.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from collections.abc import Collection + from typing import Sequence from dataclasses import dataclass @@ -11,6 +15,90 @@ from .types import PsIntegerType, PsNumericType, PsIeeeFloatType from .defaults import DEFAULTS +@dataclass +class CpuOptimConfig: + """Configuration for the CPU optimizer. + + If any flag in this configuration is set to a value not supported by the CPU specified + in `CreateKernelConfig.target`, an error will be raised. + """ + + openmp: bool = False + """Enable OpenMP parallelization. + + If set to `True`, the kernel will be parallelized using OpenMP according to the OpenMP settings + given in this configuration. + """ + + vectorize: bool | VectorizationConfig = False + """Enable and configure auto-vectorization. + + If set to an instance of `VectorizationConfig` and a CPU target with vector capabilities is selected, + pystencils will attempt to vectorize the kernel according to the given vectorization options. + + If set to `True`, pystencils will infer vectorization options from the given CPU target. + + If set to `False`, no vectorization takes place. + """ + + loop_blocking: None | tuple[int, ...] = None + """Block sizes for loop blocking. + + If set, the kernel's loops will be tiled according to the given block sizes. + """ + + use_cacheline_zeroing: bool = False + """Enable cache-line zeroing. + + If set to `True` and the selected CPU supports cacheline zeroing, the CPU optimizer will attempt + to produce cacheline zeroing instructions where possible. + """ + + +@dataclass +class VectorizationConfig: + """Configuration for the auto-vectorizer. 
+ + If any flag in this configuration is set to a value not supported by the CPU specified + in `CreateKernelConfig.target`, an error will be raised. + """ + + vector_width: int | None = None + """Desired vector register width in bits. + + If set to an integer value, the vectorizer will use this as the desired vector register width. + + If set to `None`, the vector register width will be automatically set to the broadest possible width. + + If the selected CPU does not support the given width, an error will be raised. + """ + + use_nontemporal_stores: bool | Collection[str | Field] = False + """Enable nontemporal (streaming) stores. + + If set to `True` and the selected CPU supports streaming stores, the vectorizer will generate + nontemporal store instructions for all stores. + + If set to a collection of fields (or field names), streaming stores will only be generated for + the given fields. + """ + + assume_aligned: bool = False + """Assume field pointer alignment. + + If set to `True`, the vectorizer will assume that the address of the first inner entry + (after ghost layers) of each field is aligned at the necessary byte boundary. + """ + + assume_inner_stride_one: bool = False + """Assume stride associated with the innermost spatial coordinate of all fields is one. + + If set to `True`, the vectorizer will replace the stride of the innermost spatial coordinate + with unity, thus enabling vectorization. If any fields already have a fixed innermost stride + that is not equal to one, an error will be raised. + """ + + @dataclass class CreateKernelConfig: """Options for create_kernel.""" @@ -67,6 +155,12 @@ class CreateKernelConfig: This data type will be applied to all untyped symbols. """ + cpu_optim: None | CpuOptimConfig = None + """Configuration of the CPU kernel optimizer. + + If this parameter is set while `target` is a non-CPU target, an error will be raised. 
+ """ + def __post_init__(self): # Check iteration space argument consistency if ( @@ -88,6 +182,14 @@ class CreateKernelConfig: raise PsOptionsError( "Only fields with `field_type == FieldType.INDEXED` can be specified as `index_field`" ) + + # Check optim + if self.cpu_optim is not None: + if not self.target.is_cpu(): + raise PsOptionsError(f"`cpu_optim` cannot be set for non-CPU target {self.target}") + + if self.cpu_optim.vectorize is not False and not self.target.is_vector_cpu(): + raise PsOptionsError(f"Cannot enable auto-vectorization for non-vector CPU target {self.target}") # Infer JIT if self.jit is None: diff --git a/src/pystencils/kernelcreation.py b/src/pystencils/kernelcreation.py index 439eb7872..a79c68c59 100644 --- a/src/pystencils/kernelcreation.py +++ b/src/pystencils/kernelcreation.py @@ -37,7 +37,8 @@ def create_kernel( assignments: AssignmentCollection | list[Assignment] | Assignment, config: CreateKernelConfig = CreateKernelConfig(), ): - """Create a kernel AST from an assignment collection.""" + """Create a kernel function from an assignment collection.""" + ctx = KernelCreationContext( default_dtype=config.default_dtype, index_dtype=config.index_dtype ) -- GitLab