diff --git a/src/pystencils/field.py b/src/pystencils/field.py index b4c040e53b1780b9428878d6fcfe94a4d570c3b2..7630f4cd8a50d44fc21f24164eb9afcc8c90bbc0 100644 --- a/src/pystencils/field.py +++ b/src/pystencils/field.py @@ -569,6 +569,8 @@ class Field: """ _iterable = False # see https://i10git.cs.fau.de/pycodegen/pystencils/-/merge_requests/166#note_10680 + __match_args__ = ("field", "offsets", "index") + def __new__(cls, name, *args, **kwargs): obj = Field.Access.__xnew_cached_(cls, name, *args, **kwargs) return obj diff --git a/src/pystencils/nbackend/kernelcreation/analysis.py b/src/pystencils/nbackend/kernelcreation/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..173c58e3015ac8465ff9404b7698c320a2c3b38f --- /dev/null +++ b/src/pystencils/nbackend/kernelcreation/analysis.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +from collections import namedtuple, defaultdict +from typing import Any, Sequence +from itertools import chain + +import sympy as sp + +from .context import KernelCreationContext + +from ...field import Field +from ...assignment import Assignment +from ...simp import AssignmentCollection +from ...transformations import NestedScopes + +from ..exceptions import PsInternalCompilerError + + +class KernelConstraintsError(Exception): + pass + + +class KernelAnalysis: + """General analysis pass over a kernel expressed using the SymPy frontend. + + The kernel analysis fulfills two tasks. It checks the SymPy input for consistency, + and populates the context with required knowledge about the kernel. + + A `KernelAnalysis` object may be called at most once. + + Consistency and Constraints + --------------------------- + + The following checks are performed: + + - **SSA Form:** The given assignments must be in single-assignment form; each symbol must be written at most once. + - **Independence of Accesses:** To avoid loop-carried dependencies, each field may be written at most once at + each index, and if a field is written at some location with index `i`, it may only be read with index `i` in + the same location. + - **Independence of Writes:** A weaker requirement than access independence; each field may only be written once + at each index. + + Knowledge Collection + -------------------- + + The following knowledge is collected into the context: + - The set of fields accessed in the kernel + """ + + FieldAndIndex = namedtuple("FieldAndIndex", ["field", "index"]) + + def __init__(self, ctx: KernelCreationContext): + self._ctx = ctx + + self._check_access_independence = False + self._check_double_writes = ( + False # TODO: Access independence check implies double writes check + ) + + # Map pairs of fields and indices to offsets + self._field_writes: dict[KernelAnalysis.FieldAndIndex, set[Any]] = defaultdict( + set + ) + + self._fields_written: set[Field] = set() + self._fields_read: set[Field] = set() + + self._scopes = NestedScopes() + + self._called = False + + def __call__(self, obj: AssignmentCollection | Sequence[Assignment] | Assignment): + if self._called: + raise PsInternalCompilerError("KernelAnalysis called twice!") + + self._called = True + self._visit(obj) + + for field in chain(self._fields_written, self._fields_read): + self._ctx.add_field(field) + + def _visit(self, obj: Any): + match obj: + case AssignmentCollection(main_asms, subexps): + self._visit(subexps) + self._visit(main_asms) + + case [*asms]: # lists and tuples are unpacked + for asm in asms: + self._visit(asm) + + case Assignment(lhs, rhs): + self._handle_rhs(rhs) + self._handle_lhs(lhs) + + case unknown: + raise KernelConstraintsError( + f"Don't know how to interpret {unknown} in a kernel." + ) + + def _handle_lhs(self, lhs: sp.Basic): + if not isinstance(lhs, sp.Symbol): + raise KernelConstraintsError( + f"Invalid expression on assignment left-hand side: {lhs}" + ) + + match lhs: + case Field.Access(field, offsets, index): + self._fields_written.add(field) + self._fields_read.update(lhs.indirect_addressing_fields) + + fai = self.FieldAndIndex(field, index) + if self._check_double_writes and offsets in self._field_writes[fai]: + raise KernelConstraintsError( + f"Field {field.name} is written twice at the same location" + ) + + self._field_writes[fai].add(offsets) + + if self._check_double_writes and len(self._field_writes[fai]) > 1: + raise KernelConstraintsError( + f"Field {field.name} is written at two different locations" + ) + + case sp.Symbol(): + if self._scopes.is_defined_locally(lhs): + raise KernelConstraintsError( + f"Assignments not in SSA form, multiple assignments to {lhs.name}" + ) + if lhs in self._scopes.free_parameters: + raise KernelConstraintsError( + f"Symbol {lhs.name} is written, after it has been read" + ) + self._scopes.define_symbol(lhs) + + def _handle_rhs(self, rhs: sp.Basic): + def rec(expr: sp.Basic): + match expr: + case Field.Access(field, offsets, index): + self._fields_read.add(field) + self._fields_read.update(expr.indirect_addressing_fields) + # TODO: Should we recurse into the arguments of the field access? + + if self._check_access_independence: + writes = self._field_writes[ + KernelAnalysis.FieldAndIndex(field, index) + ] + assert len(writes) <= 1 + for write_offset in writes: + if write_offset != offsets: + raise KernelConstraintsError( + f"Violation of loop independence condition. Field " + f"{field} is read at {offsets} and written at {write_offset}" + ) + case sp.Symbol(): + self._scopes.access_symbol(expr) + + for arg in expr.args: + rec(arg) + + rec(rhs) diff --git a/src/pystencils/nbackend/kernelcreation/domain_kernels.py b/src/pystencils/nbackend/kernelcreation/domain_kernels.py deleted file mode 100644 index 18f157c09bcde5f28775c0d75dd4eef73cfd0d8d..0000000000000000000000000000000000000000 --- a/src/pystencils/nbackend/kernelcreation/domain_kernels.py +++ /dev/null @@ -1,136 +0,0 @@ -from typing import Sequence -from itertools import chain - -from ...simp import AssignmentCollection -from ...field import Field, FieldType -from ...kernel_contrains_check import KernelConstraintsCheck - -from ..ast import PsBlock - -from .context import KernelCreationContext, IterationSpace -from .freeze import FreezeExpressions -from .typification import Typifier -from .options import KernelCreationOptions -from ..exceptions import PsInputError, PsInternalCompilerError - -# flake8: noqa - - -def create_kernel(assignments: AssignmentCollection, options: KernelCreationOptions): - # 1. Prepare context - ctx = KernelCreationContext(options) - - # 2. Check kernel constraints and collect all fields - - """ - TODO: Replace the KernelConstraintsCheck by a KernelAnalysis pass. - - The kernel analysis should: - - Check constraints on the assignments (SSA form, independence conditions, ...) - - Collect all fields and register them in the context - - Maybe collect all field accesses and register them at the context - - Running all this analysis in a single pass will likely improve compiler performance - since no additional searches, e.g. for field accesses, are necessary later. - """ - - check = KernelConstraintsCheck() # TODO: config - check.visit(assignments) - - # Collect all fields - for f in chain(check.fields_written, check.fields_read): - ctx.add_field(f) - - # 3. Create iteration space - ispace: IterationSpace = ( - create_sparse_iteration_space(ctx, assignments) - if len(ctx.fields.index_fields) > 0 - else create_full_iteration_space(ctx, assignments) - ) - - ctx.set_iteration_space(ispace) - - # 4. Freeze assignments - # This call is the same for both domain and indexed kernels - freeze = FreezeExpressions(ctx) - kernel_body: PsBlock = freeze(assignments) - - # 5. Typify - # Also the same for both types of kernels - typify = Typifier(ctx) - kernel_body = typify(kernel_body) - - # Up to this point, all was target-agnostic, but now the target becomes relevant. - # Here we might hand off the compilation to a target-specific part of the compiler - # (CPU/CUDA/...), since these will likely also apply very different optimizations. - - # 6. Add loops or device indexing - # This step translates the iteration space to actual index calculation code and is once again - # different in indexed and domain kernels. - - # 7. Apply optimizations - # - Vectorization - # - OpenMP - # - Loop Splitting, Tiling, Blocking - - # 8. Create and return kernel function. - - -def create_sparse_iteration_space( - ctx: KernelCreationContext, assignments: AssignmentCollection -) -> IterationSpace: - return NotImplemented - - -def create_full_iteration_space( - ctx: KernelCreationContext, assignments: AssignmentCollection -) -> IterationSpace: - assert not ctx.fields.index_fields - - # Collect all relative accesses into domain fields - def access_filter(acc: Field.Access): - return acc.field.field_type in ( - FieldType.GENERIC, - FieldType.STAGGERED, - FieldType.STAGGERED_FLUX, - ) - - domain_field_accesses = assignments.atoms(Field.Access) - domain_field_accesses = set(filter(access_filter, domain_field_accesses)) - - # The following scenarios exist: - # - We have at least one domain field -> find the common field and use it to determine the iteration region - # - We have no domain fields, but at least one custom field -> determine common field from custom fields - # - We have neither domain nor custom fields -> Error - - from ...transformations import get_common_field - - if len(domain_field_accesses) > 0: - archetype_field = get_common_field(ctx.fields.domain_fields) - inferred_gls = max( - [fa.required_ghost_layers for fa in domain_field_accesses] - ) - elif len(ctx.fields.custom_fields) > 0: - archetype_field = get_common_field(ctx.fields.custom_fields) - inferred_gls = 0 - else: - raise PsInputError( - "Unable to construct iteration space: The kernel contains no accesses to domain or custom fields." - ) - - # If the user provided a ghost layer specification, use that - # Otherwise, if an iteration slice was specified, use that - # Otherwise, use the inferred ghost layers - - from .iteration_space import FullIterationSpace - - if ctx.options.ghost_layers is not None: - return FullIterationSpace.create_with_ghost_layers( - ctx, archetype_field, ctx.options.ghost_layers - ) - elif ctx.options.iteration_slice is not None: - raise PsInternalCompilerError("Iteration slices not supported yet") - else: - return FullIterationSpace.create_with_ghost_layers( - ctx, archetype_field, inferred_gls - ) diff --git a/src/pystencils/nbackend/kernelcreation/iteration_space.py b/src/pystencils/nbackend/kernelcreation/iteration_space.py index 1cabc6736209398e7cd10551c0ae7f980caa0f60..2bec83320d4a5cfc7fbb444e818ce8eeb3cf0e3c 100644 --- a/src/pystencils/nbackend/kernelcreation/iteration_space.py +++ b/src/pystencils/nbackend/kernelcreation/iteration_space.py @@ -5,7 +5,8 @@ from dataclasses import dataclass from functools import reduce from operator import mul -from ...field import Field +from ...simp import AssignmentCollection +from ...field import Field, FieldType from ..typed_expressions import ( PsTypedVariable, @@ -15,6 +16,7 @@ from ..typed_expressions import ( ) from ..arrays import PsLinearizedArray from .defaults import Pymbolic as Defaults +from ..exceptions import PsInputError, PsInternalCompilerError if TYPE_CHECKING: from .context import KernelCreationContext @@ -141,3 +143,61 @@ class SparseIterationSpace(IterationSpace): @property def index_list(self) -> PsLinearizedArray: return self._index_list + + +def create_sparse_iteration_space( + ctx: KernelCreationContext, assignments: AssignmentCollection +) -> IterationSpace: + return NotImplemented + + +def create_full_iteration_space( + ctx: KernelCreationContext, assignments: AssignmentCollection +) -> IterationSpace: + assert not ctx.fields.index_fields + + # Collect all relative accesses into domain fields + def access_filter(acc: Field.Access): + return acc.field.field_type in ( + FieldType.GENERIC, + FieldType.STAGGERED, + FieldType.STAGGERED_FLUX, + ) + + domain_field_accesses = assignments.atoms(Field.Access) + domain_field_accesses = set(filter(access_filter, domain_field_accesses)) + + # The following scenarios exist: + # - We have at least one domain field -> find the common field and use it to determine the iteration region + # - We have no domain fields, but at least one custom field -> determine common field from custom fields + # - We have neither domain nor custom fields -> Error + + from ...transformations import get_common_field + + if len(domain_field_accesses) > 0: + archetype_field = get_common_field(ctx.fields.domain_fields) + inferred_gls = max([fa.required_ghost_layers for fa in domain_field_accesses]) + elif len(ctx.fields.custom_fields) > 0: + archetype_field = get_common_field(ctx.fields.custom_fields) + inferred_gls = 0 + else: + raise PsInputError( + "Unable to construct iteration space: The kernel contains no accesses to domain or custom fields." + ) + + # If the user provided a ghost layer specification, use that + # Otherwise, if an iteration slice was specified, use that + # Otherwise, use the inferred ghost layers + + from .iteration_space import FullIterationSpace + + if ctx.options.ghost_layers is not None: + return FullIterationSpace.create_with_ghost_layers( + ctx, archetype_field, ctx.options.ghost_layers + ) + elif ctx.options.iteration_slice is not None: + raise PsInternalCompilerError("Iteration slices not supported yet") + else: + return FullIterationSpace.create_with_ghost_layers( + ctx, archetype_field, inferred_gls + ) diff --git a/src/pystencils/nbackend/kernelcreation/kernelcreation.py b/src/pystencils/nbackend/kernelcreation/kernelcreation.py new file mode 100644 index 0000000000000000000000000000000000000000..732b034597ec6aecfaaccf9323d0f5b3c431bd43 --- /dev/null +++ b/src/pystencils/nbackend/kernelcreation/kernelcreation.py @@ -0,0 +1,61 @@ +from itertools import chain + +from ...simp import AssignmentCollection + +from ..ast import PsBlock + +from .context import KernelCreationContext +from .analysis import KernelAnalysis +from .freeze import FreezeExpressions +from .typification import Typifier +from .options import KernelCreationOptions +from .iteration_space import ( + IterationSpace, + create_sparse_iteration_space, + create_full_iteration_space, +) + +# flake8: noqa + + +def create_kernel(assignments: AssignmentCollection, options: KernelCreationOptions): + # 1. Prepare context + ctx = KernelCreationContext(options) + + # 2. Check kernel constraints and collect knowledge + analysis = KernelAnalysis(ctx) + analysis(assignments) + + # 3. Create iteration space + ispace: IterationSpace = ( + create_sparse_iteration_space(ctx, assignments) + if len(ctx.fields.index_fields) > 0 + else create_full_iteration_space(ctx, assignments) + ) + + ctx.set_iteration_space(ispace) + + # 4. Freeze assignments + # This call is the same for both domain and indexed kernels + freeze = FreezeExpressions(ctx) + kernel_body: PsBlock = freeze(assignments) + + # 5. Typify + # Also the same for both types of kernels + typify = Typifier(ctx) + kernel_body = typify(kernel_body) + + # Up to this point, all was target-agnostic, but now the target becomes relevant. + # Here we might hand off the compilation to a target-specific part of the compiler + # (CPU/CUDA/...), since these will likely also apply very different optimizations. + + # 6. Add loops or device indexing + # This step translates the iteration space to actual index calculation code and is once again + # different in indexed and domain kernels. + + # 7. Apply optimizations + # - Vectorization + # - OpenMP + # - Loop Splitting, Tiling, Blocking + + # 8. Create and return kernel function. diff --git a/src/pystencils/simp/assignment_collection.py b/src/pystencils/simp/assignment_collection.py index b0c09cec9ae2066d2db9fcb11d2c03e239d0e091..cb89e58bbd58b4b5344aa3a59d741f1618b3a574 100644 --- a/src/pystencils/simp/assignment_collection.py +++ b/src/pystencils/simp/assignment_collection.py @@ -31,6 +31,8 @@ class AssignmentCollection: """ + __match_args__ = ("main_assignments", "subexpressions") + # ------------------------------- Creation & Inplace Manipulation -------------------------------------------------- def __init__(self, main_assignments: Union[List[Assignment], Dict[sp.Expr, sp.Expr]],