Frederik Hennig · Frederik Hennig · Frederik Hennig · Frederik Hennig · Frederik Hennig · Frederik Hennig
--- a/src/pystencils/backend/ast/expressions.py
+++ b/src/pystencils/backend/ast/expressions.py
@@ -51,7 +51,7 @@ class PsExpression(PsAstNode, ABC):

    def get_dtype(self) -> PsType:
        if self._dtype is None:
-            raise PsInternalCompilerError("No dtype set on this expression yet.")
+            raise PsInternalCompilerError(f"No data type set on expression {self}.")

        return self._dtype


--- a/src/pystencils/backend/emission/ir_printer.py
+++ b/src/pystencils/backend/emission/ir_printer.py
@@ -17,9 +17,21 @@ def emit_ir(ir: PsAstNode):


 class IRAstPrinter(BasePrinter):
-
-    def __init__(self, indent_width=3):
+    """Print the IR AST as pseudo-code.
+    
+    This printer produces a complete pseudocode representation of a pystencils AST.
+    Other than the `CAstPrinter`, the `IRAstPrinter` is capable of emitting code for
+    each node defined in `ast <pystencils.backend.ast>`.
+    It is furthermore configurable w.r.t. the level of detail it should emit.
+
+    Args:
+        indent_width: Number of spaces with which to indent lines in each nested block.
+        annotate_constants: If ``True`` (the default), annotate all constant literals with their data type.
+    """
+
+    def __init__(self, indent_width=3, annotate_constants: bool = True):
        super().__init__(indent_width)
+        self._annotate_constants = annotate_constants

    def visit(self, node: PsAstNode, pc: PrinterCtx) -> str:
        match node:
@@ -66,7 +78,10 @@ class IRAstPrinter(BasePrinter):
        return f"{symb.name}: {self._type_str(symb.dtype)}"

    def _constant_literal(self, constant: PsConstant) -> str:
-        return f"[{constant.value}: {self._deconst_type_str(constant.dtype)}]"
+        if self._annotate_constants:
+            return f"[{constant.value}: {self._deconst_type_str(constant.dtype)}]"
+        else:
+            return str(constant.value)

    def _type_str(self, dtype: PsType | None):
        if dtype is None:

--- a/src/pystencils/backend/kernelcreation/__init__.py
+++ b/src/pystencils/backend/kernelcreation/__init__.py
@@ -12,8 +12,6 @@ from .iteration_space import (
    create_sparse_iteration_space,
 )

-from .cpu_optimization import optimize_cpu
-
 __all__ = [
    "KernelCreationContext",
    "KernelAnalysis",
@@ -25,5 +23,4 @@ __all__ = [
    "SparseIterationSpace",
    "create_full_iteration_space",
    "create_sparse_iteration_space",
-    "optimize_cpu",
 ]
--- a/src/pystencils/backend/kernelcreation/cpu_optimization.py
+++ b/src/pystencils/backend/kernelcreation/cpu_optimization.py
-from __future__ import annotations
-from typing import cast, TYPE_CHECKING
-
-from .context import KernelCreationContext
-from ..ast.structural import PsBlock
-
-from ...config import CpuOptimConfig, OpenMpConfig
-
-if TYPE_CHECKING:
-    from ..platforms import GenericCpu
-
-
-def optimize_cpu(
-    ctx: KernelCreationContext,
-    platform: GenericCpu,
-    kernel_ast: PsBlock,
-    cfg: CpuOptimConfig | None,
-) -> PsBlock:
-    """Carry out CPU-specific optimizations according to the given configuration."""
-    from ..transformations import CanonicalizeSymbols, HoistLoopInvariantDeclarations
-
-    canonicalize = CanonicalizeSymbols(ctx, True)
-    kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
-
-    hoist_invariants = HoistLoopInvariantDeclarations(ctx)
-    kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
-
-    if cfg is None:
-        return kernel_ast
-
-    if cfg.loop_blocking:
-        raise NotImplementedError("Loop blocking not implemented yet.")
-
-    if cfg.vectorize is not False:
-        raise NotImplementedError("Vectorization not implemented yet")
-
-    if cfg.openmp is not False:
-        from ..transformations import AddOpenMP
-
-        params = cfg.openmp if isinstance(cfg.openmp, OpenMpConfig) else OpenMpConfig()
-        add_omp = AddOpenMP(ctx, params)
-        kernel_ast = cast(PsBlock, add_omp(kernel_ast))
-
-    if cfg.use_cacheline_zeroing:
-        raise NotImplementedError("CL-zeroing not implemented yet")
-
-    return kernel_ast
--- a/src/pystencils/backend/kernelcreation/iteration_space.py
+++ b/src/pystencils/backend/kernelcreation/iteration_space.py
@@ -13,7 +13,7 @@ from ..memory import PsSymbol, PsBuffer
 from ..constants import PsConstant
 from ..ast.expressions import PsExpression, PsConstantExpr, PsTernary, PsEq, PsRem
 from ..ast.util import failing_cast
-from ...types import PsStructType, constify
+from ...types import PsStructType
 from ..exceptions import PsInputError, KernelConstraintsError

 if TYPE_CHECKING:
@@ -209,17 +209,23 @@ class FullIterationSpace(IterationSpace):
    @property
    def archetype_field(self) -> Field | None:
        return self._archetype_field
+    
+    @property
+    def loop_order(self) -> tuple[int, ...]:
+        """Return the loop order of this iteration space, ordered from slowest to fastest coordinate."""
+        if self._archetype_field is not None:
+            return self._archetype_field.layout
+        else:
+            return tuple(range(len(self.dimensions)))

    def dimensions_in_loop_order(self) -> Sequence[FullIterationSpace.Dimension]:
        """Return the dimensions of this iteration space ordered from the slowest to the fastest coordinate.

-        If an archetype field is specified, the field layout is used to determine the ideal loop order;
+        If this iteration space has an `archetype field <FullIterationSpace.archetype_field>` set,
+        its field layout is used to determine the ideal loop order;
        otherwise, the dimensions are returned as they are
        """
-        if self._archetype_field is not None:
-            return [self._dimensions[i] for i in self._archetype_field.layout]
-        else:
-            return self._dimensions
+        return [self._dimensions[i] for i in self.loop_order]

    def actual_iterations(
        self, dimension: int | FullIterationSpace.Dimension | None = None
@@ -359,7 +365,7 @@ def create_sparse_iteration_space(
    dim = archetype_field.spatial_dimensions
    coord_members = [
        PsStructType.Member(name, ctx.index_dtype)
-        for name in DEFAULTS._index_struct_coordinate_names[:dim]
+        for name in DEFAULTS.index_struct_coordinate_names[:dim]
    ]

    #   Determine index field
@@ -379,7 +385,7 @@ def create_sparse_iteration_space(
        )

    spatial_counters = [
-        ctx.get_symbol(name, constify(ctx.index_dtype))
+        ctx.get_symbol(name, ctx.index_dtype)
        for name in DEFAULTS.spatial_counter_names[:dim]
    ]


--- a/src/pystencils/backend/platforms/x86.py
+++ b/src/pystencils/backend/platforms/x86.py
@@ -21,6 +21,7 @@ from ..constants import PsConstant

 from ..exceptions import MaterializationError
 from .generic_cpu import GenericVectorCpu
+from ..kernelcreation import KernelCreationContext

 from ...types.quick import Fp, SInt
 from ..functions import CFunction
@@ -77,6 +78,28 @@ class X86VectorArch(Enum):
                )

        return suffix
+    
+    def intrin_type(self, vtype: PsVectorType):
+        scalar_type = vtype.scalar_type
+        match scalar_type:
+            case Fp(16) if self >= X86VectorArch.AVX512:
+                suffix = "h"
+            case Fp(32):
+                suffix = ""
+            case Fp(64):
+                suffix = "d"
+            case SInt(_):
+                suffix = "i"
+            case _:
+                raise MaterializationError(
+                    f"x86/{self} does not support scalar type {scalar_type}"
+                )
+
+        if vtype.width > self.max_vector_width:
+            raise MaterializationError(
+                f"x86/{self} does not support {vtype}"
+            )
+        return PsCustomType(f"__m{vtype.width}{suffix}")


 class X86VectorCpu(GenericVectorCpu):
@@ -86,7 +109,8 @@ class X86VectorCpu(GenericVectorCpu):
    https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html.
    """

-    def __init__(self, vector_arch: X86VectorArch):
+    def __init__(self, ctx: KernelCreationContext, vector_arch: X86VectorArch):
+        super().__init__(ctx)
        self._vector_arch = vector_arch

    @property
@@ -111,26 +135,7 @@ class X86VectorCpu(GenericVectorCpu):
        return super().required_headers | headers

    def type_intrinsic(self, vector_type: PsVectorType) -> PsCustomType:
-        scalar_type = vector_type.scalar_type
-        match scalar_type:
-            case Fp(16) if self._vector_arch >= X86VectorArch.AVX512:
-                suffix = "h"
-            case Fp(32):
-                suffix = ""
-            case Fp(64):
-                suffix = "d"
-            case SInt(_):
-                suffix = "i"
-            case _:
-                raise MaterializationError(
-                    f"x86/{self._vector_arch} does not support scalar type {scalar_type}"
-                )
-
-        if vector_type.width > self._vector_arch.max_vector_width:
-            raise MaterializationError(
-                f"x86/{self._vector_arch} does not support {vector_type}"
-            )
-        return PsCustomType(f"__m{vector_type.width}{suffix}")
+        return self._vector_arch.intrin_type(vector_type)

    def constant_intrinsic(self, c: PsConstant) -> PsExpression:
        vtype = c.dtype
@@ -212,12 +217,14 @@ def _x86_op_intrin(
 ) -> CFunction:
    prefix = varch.intrin_prefix(vtype)
    suffix = varch.intrin_suffix(vtype)
+    rtype = atype = varch.intrin_type(vtype)

    match op:
        case PsVecBroadcast():
            opstr = "set1"
            if vtype.scalar_type == SInt(64) and vtype.vector_entries <= 4:
-                suffix += "x"
+                suffix += "x"   
+            atype = vtype.scalar_type
        case PsAdd():
            opstr = "add"
        case PsSub():
@@ -236,4 +243,4 @@ def _x86_op_intrin(
            raise MaterializationError(f"Unable to select operation intrinsic for {type(op)}")

    num_args = 1 if isinstance(op, PsUnOp) else 2
-    return CFunction(f"{prefix}_{opstr}_{suffix}", (vtype,) * num_args, vtype)
+    return CFunction(f"{prefix}_{opstr}_{suffix}", (atype,) * num_args, rtype)
--- a/src/pystencils/backend/transformations/eliminate_constants.py
+++ b/src/pystencils/backend/transformations/eliminate_constants.py
@@ -45,7 +45,7 @@ from ...types import (
    PsBoolType,
    PsScalarType,
    PsVectorType,
-    PsTypeError,
+    constify
 )


@@ -57,9 +57,9 @@ class ECContext:
        self._ctx = ctx
        self._extracted_constants: dict[AstEqWrapper, PsSymbol] = dict()

-        from ..emission import CAstPrinter
+        from ..emission import IRAstPrinter

-        self._printer = CAstPrinter(0)
+        self._printer = IRAstPrinter(indent_width=0, annotate_constants=False)

    @property
    def extractions(self) -> Iterable[tuple[PsSymbol, PsExpression]]:
@@ -89,10 +89,7 @@ class ECContext:

        if expr_wrapped not in self._extracted_constants:
            symb_name = self._get_symb_name(expr)
-            try:
-                symb = self._ctx.get_symbol(symb_name, dtype)
-            except PsTypeError:
-                symb = self._ctx.get_symbol(f"{symb_name}_{dtype.c_string()}", dtype)
+            symb = self._ctx.get_new_symbol(symb_name, constify(dtype))

            self._extracted_constants[expr_wrapped] = symb
        else:
@@ -133,6 +130,10 @@ class EliminateConstants:
    def __call__(self, node: PsExpression) -> PsExpression:
        pass

+    @overload
+    def __call__(self, node: PsBlock) -> PsBlock:
+        pass
+
    @overload
    def __call__(self, node: PsAstNode) -> PsAstNode:
        pass

--- a/src/pystencils/backend/transformations/loop_vectorizer.py
+++ b/src/pystencils/backend/transformations/loop_vectorizer.py
@@ -61,6 +61,24 @@ class LoopVectorizer:
        self._vectorize_ast = AstVectorizer(ctx)
        self._fold = EliminateConstants(ctx)

+    @overload
+    def vectorize_select_loops(
+        self, node: PsBlock, predicate: Callable[[PsLoop], bool]
+    ) -> PsBlock:
+        ...
+
+    @overload
+    def vectorize_select_loops(
+        self, node: PsLoop, predicate: Callable[[PsLoop], bool]
+    ) -> PsLoop | PsBlock:
+        ...
+
+    @overload
+    def vectorize_select_loops(
+        self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
+    ) -> PsAstNode:
+        ...
+
    def vectorize_select_loops(
        self, node: PsAstNode, predicate: Callable[[PsLoop], bool]
    ) -> PsAstNode:

--- a/src/pystencils/backend/transformations/lower_to_c.py
+++ b/src/pystencils/backend/transformations/lower_to_c.py
@@ -37,9 +37,13 @@ class LowerToC:

    def __init__(self, ctx: KernelCreationContext) -> None:
        self._ctx = ctx
+        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+
        self._typify = Typifier(ctx)

-        self._substitutions: dict[PsSymbol, PsSymbol] = dict()
+        from .eliminate_constants import EliminateConstants
+
+        self._fold = EliminateConstants(self._ctx)

    def __call__(self, node: PsAstNode) -> PsAstNode:
        self._substitutions = dict()
@@ -65,7 +69,8 @@ class LowerToC:
                        return i

                summands: list[PsExpression] = [
-                    maybe_cast(cast(PsExpression, self.visit(idx).clone())) * PsExpression.make(stride)
+                    maybe_cast(cast(PsExpression, self.visit(idx).clone()))
+                    * PsExpression.make(stride)
                    for idx, stride in zip(indices, buf.strides, strict=True)
                ]

@@ -77,9 +82,11 @@ class LowerToC:

                mem_acc = PsMemAcc(bptr.clone(), linearized_idx)

-                return self._typify.typify_expression(
-                    mem_acc, target_type=buf.element_type
-                )[0]
+                return self._fold(
+                    self._typify.typify_expression(
+                        mem_acc, target_type=buf.element_type
+                    )[0]
+                )

            case PsLookup(aggr, member_name) if isinstance(
                aggr, PsBufferAcc
@@ -115,10 +122,7 @@ class LowerToC:
                const=bp_type.const,
                restrict=bp_type.restrict,
            )
-            type_erased_bp = PsSymbol(
-                bp.name,
-                erased_type
-            )
+            type_erased_bp = PsSymbol(bp.name, erased_type)
            type_erased_bp.add_property(BufferBasePtr(buf))
            self._substitutions[bp] = type_erased_bp
        else:

--- a/src/pystencils/config.py
+++ b/src/pystencils/config.py
@@ -5,12 +5,18 @@ from warnings import warn
 from collections.abc import Collection

 from typing import Sequence
-from dataclasses import dataclass, InitVar
+from dataclasses import dataclass, InitVar, replace

 from .target import Target
 from .field import Field, FieldType

-from .types import PsIntegerType, UserTypeSpec, PsIeeeFloatType, create_type
+from .types import (
+    PsIntegerType,
+    UserTypeSpec,
+    PsIeeeFloatType,
+    PsScalarType,
+    create_type,
+)

 from .defaults import DEFAULTS

@@ -90,6 +96,14 @@ class CpuOptimConfig:
    to produce cacheline zeroing instructions where possible.
    """

+    def get_vectorization_config(self) -> VectorizationConfig | None:
+        if self.vectorize is True:
+            return VectorizationConfig()
+        elif isinstance(self.vectorize, VectorizationConfig):
+            return self.vectorize
+        else:
+            return None
+

 @dataclass
 class VectorizationConfig:
@@ -99,14 +113,13 @@ class VectorizationConfig:
    in `CreateKernelConfig.target`, an error will be raised.
    """

-    vector_width: int | None = None
-    """Desired vector register width in bits.
-    
-    If set to an integer value, the vectorizer will use this as the desired vector register width.
+    lanes: int | None = None
+    """Number of SIMD lanes to be used in vectorization.

-    If set to `None`, the vector register width will be automatically set to the broadest possible.
+    If set to `None` (the default), the vector register width will be automatically set to the broadest possible.
    
-    If the selected CPU does not support the given width, an error will be raised.
+    If the CPU architecture specified in `target <CreateKernelConfig.target>` does not support some
+    operation contained in the kernel with the given number of lanes, an error will be raised.
    """

    use_nontemporal_stores: bool | Collection[str | Field] = False
@@ -134,6 +147,25 @@ class VectorizationConfig:
    that is not equal to one, an error will be raised.
    """

+    @staticmethod
+    def default_lanes(target: Target, dtype: PsScalarType):
+        if not target.is_vector_cpu():
+            raise ValueError(f"Given target {target} is no vector CPU target.")
+
+        assert dtype.itemsize is not None
+
+        match target:
+            case Target.X86_SSE:
+                return 128 // (dtype.itemsize * 8)
+            case Target.X86_AVX:
+                return 256 // (dtype.itemsize * 8)
+            case Target.X86_AVX512 | Target.X86_AVX512_FP16:
+                return 512 // (dtype.itemsize * 8)
+            case _:
+                raise NotImplementedError(
+                    f"No default number of lanes known for {dtype} on {target}"
+                )
+

 @dataclass
 class GpuIndexingConfig:
@@ -266,6 +298,13 @@ class CreateKernelConfig:

    #   Getters

+    def get_target(self) -> Target:
+        match self.target:
+            case Target.CurrentCPU:
+                return Target.auto_cpu()
+            case _:
+                return self.target
+
    def get_jit(self) -> JitBase:
        """Returns either the user-specified JIT compiler, or infers one from the target if none is given."""
        if self.jit is None:
@@ -371,7 +410,7 @@ class CreateKernelConfig:
            warn(
                "Setting the deprecated `data_type` will override the value of `default_dtype`. "
                "Set `default_dtype` instead.",
-                FutureWarning,
+                UserWarning,
            )
            self.default_dtype = data_type

@@ -394,7 +433,52 @@ class CreateKernelConfig:

        if cpu_vectorize_info is not None:
            _deprecated_option("cpu_vectorize_info", "cpu_optim.vectorize")
-            raise NotImplementedError("CPU vectorization is not implemented yet")
+            if "instruction_set" in cpu_vectorize_info:
+                if self.target != Target.GenericCPU:
+                    raise PsOptionsError(
+                        "Setting 'instruction_set' in the deprecated 'cpu_vectorize_info' option is only "
+                        "valid if `target == Target.CPU`."
+                    )
+
+                isa = cpu_vectorize_info["instruction_set"]
+                vec_target: Target
+                match isa:
+                    case "best":
+                        vec_target = Target.available_vector_cpu_targets().pop()
+                    case "sse":
+                        vec_target = Target.X86_SSE
+                    case "avx":
+                        vec_target = Target.X86_AVX
+                    case "avx512":
+                        vec_target = Target.X86_AVX512
+                    case "avx512vl":
+                        vec_target = Target.X86_AVX512 | Target._VL
+                    case _:
+                        raise PsOptionsError(
+                            f'Value {isa} in `cpu_vectorize_info["instruction_set"]` is not supported.'
+                        )
+
+                warn(
+                    f"Value {isa} for `instruction_set` in deprecated `cpu_vectorize_info` "
+                    "will override the `target` option. "
+                    f"Set `target` to {vec_target} instead.",
+                    UserWarning,
+                )
+
+                self.target = vec_target
+
+            deprecated_vec_opts = VectorizationConfig(
+                assume_inner_stride_one=cpu_vectorize_info.get(
+                    "assume_inner_stride_one", False
+                ),
+                assume_aligned=cpu_vectorize_info.get("assume_aligned", False),
+                use_nontemporal_stores=cpu_vectorize_info.get("nontemporal", False),
+            )
+
+            if optim is not None:
+                optim = replace(optim, vectorize=deprecated_vec_opts)
+            else:
+                optim = CpuOptimConfig(vectorize=deprecated_vec_opts)

        if optim is not None:
            if self.cpu_optim is not None:

--- a/src/pystencils/defaults.py
+++ b/src/pystencils/defaults.py
-from typing import TypeVar, Generic, Callable
-from .types import PsType, PsIeeeFloatType, PsIntegerType, PsSignedIntegerType, PsStructType
+from .types import (
+    PsIeeeFloatType,
+    PsIntegerType,
+    PsSignedIntegerType,
+    PsStructType,
+    UserTypeSpec,
+    create_type,
+)

-from pystencils.sympyextensions.typed_sympy import TypedSymbol
+from pystencils.sympyextensions.typed_sympy import TypedSymbol, DynamicType

-SymbolT = TypeVar("SymbolT")

-
-class GenericDefaults(Generic[SymbolT]):
-    def __init__(self, symcreate: Callable[[str, PsType], SymbolT]):
+class SympyDefaults:
+    def __init__(self):
        self.numeric_dtype = PsIeeeFloatType(64)
        """Default data type for numerical computations"""

@@ -18,37 +22,38 @@ class GenericDefaults(Generic[SymbolT]):
        """Names of the default spatial counters"""

        self.spatial_counters = (
-            symcreate("ctr_0", self.index_dtype),
-            symcreate("ctr_1", self.index_dtype),
-            symcreate("ctr_2", self.index_dtype),
+            TypedSymbol("ctr_0", DynamicType.INDEX_TYPE),
+            TypedSymbol("ctr_1", DynamicType.INDEX_TYPE),
+            TypedSymbol("ctr_2", DynamicType.INDEX_TYPE),
        )
        """Default spatial counters"""

-        self._index_struct_coordinate_names = ("x", "y", "z")
+        self.index_struct_coordinate_names = ("x", "y", "z")
        """Default names of spatial coordinate members in index list structures"""

-        self.index_struct_coordinates = (
-            PsStructType.Member("x", self.index_dtype),
-            PsStructType.Member("y", self.index_dtype),
-            PsStructType.Member("z", self.index_dtype),
-        )
-        """Default spatial coordinate members in index list structures"""
-
        self.sparse_counter_name = "sparse_idx"
        """Name of the default sparse iteration counter"""

-        self.sparse_counter = symcreate(self.sparse_counter_name, self.index_dtype)
+        self.sparse_counter = TypedSymbol(
+            self.sparse_counter_name, DynamicType.INDEX_TYPE
+        )
        """Default sparse iteration counter."""

    def field_shape_name(self, field_name: str, coord: int):
        return f"_size_{field_name}_{coord}"
-    
+
    def field_stride_name(self, field_name: str, coord: int):
        return f"_stride_{field_name}_{coord}"
-    
+
    def field_pointer_name(self, field_name: str):
        return f"_data_{field_name}"

+    def index_struct(self, index_dtype: UserTypeSpec, dim: int) -> PsStructType:
+        idx_type = create_type(index_dtype)
+        return PsStructType(
+            [(name, idx_type) for name in self.index_struct_coordinate_names[:dim]]
+        )
+

-DEFAULTS = GenericDefaults[TypedSymbol](TypedSymbol)
+DEFAULTS = SympyDefaults()
 """Default names and symbols used throughout code generation"""
--- a/src/pystencils/kernelcreation.py
+++ b/src/pystencils/kernelcreation.py
@@ -2,27 +2,35 @@ from typing import cast, Sequence
 from dataclasses import replace

 from .target import Target
-from .config import CreateKernelConfig
+from .config import (
+    CreateKernelConfig,
+    OpenMpConfig,
+    VectorizationConfig,
+)
 from .backend import KernelFunction
-from .types import create_numeric_type, PsIntegerType
-from .backend.ast.structural import PsBlock
+from .types import create_numeric_type, PsIntegerType, PsScalarType
+from .backend.ast.structural import PsBlock, PsLoop
 from .backend.kernelcreation import (
    KernelCreationContext,
    KernelAnalysis,
    FreezeExpressions,
    Typifier,
 )
+from .backend.constants import PsConstant
 from .backend.kernelcreation.iteration_space import (
    create_sparse_iteration_space,
    create_full_iteration_space,
+    FullIterationSpace,
 )
-
+from .backend.platforms import Platform, GenericCpu, GenericVectorCpu, GenericGpu
+from .backend.exceptions import VectorizationError

 from .backend.transformations import (
    EliminateConstants,
    LowerToC,
    SelectFunctions,
    CanonicalizeSymbols,
+    HoistLoopInvariantDeclarations,
 )
 from .backend.kernelfunction import (
    create_cpu_kernel_function,
@@ -60,125 +68,245 @@ def create_kernel(
    if kwargs:
        config = replace(config, **kwargs)

-    idx_dtype = create_numeric_type(config.index_dtype)
-    assert isinstance(idx_dtype, PsIntegerType)
+    driver = DefaultKernelCreationDriver(config)
+    return driver(assignments)

-    ctx = KernelCreationContext(
-        default_dtype=create_numeric_type(config.default_dtype),
-        index_dtype=idx_dtype,
-    )

-    if isinstance(assignments, AssignmentBase):
-        assignments = [assignments]
+class DefaultKernelCreationDriver:
+    def __init__(self, cfg: CreateKernelConfig):
+        self._cfg = cfg

-    if not isinstance(assignments, AssignmentCollection):
-        assignments = AssignmentCollection(assignments)  # type: ignore
+        idx_dtype = create_numeric_type(self._cfg.index_dtype)
+        assert isinstance(idx_dtype, PsIntegerType)

-    _ = _parse_simplification_hints(assignments)
+        self._ctx = KernelCreationContext(
+            default_dtype=create_numeric_type(self._cfg.default_dtype),
+            index_dtype=idx_dtype,
+        )

-    analysis = KernelAnalysis(
-        ctx, not config.skip_independence_check, not config.allow_double_writes
-    )
-    analysis(assignments)
+        self._target = self._cfg.get_target()
+        self._platform = self._get_platform()

-    if len(ctx.fields.index_fields) > 0 or config.index_field is not None:
-        ispace = create_sparse_iteration_space(
-            ctx, assignments, index_field=config.index_field
-        )
-    else:
-        ispace = create_full_iteration_space(
-            ctx,
-            assignments,
-            ghost_layers=config.ghost_layers,
-            iteration_slice=config.iteration_slice,
+    def __call__(
+        self,
+        assignments: AssignmentCollection | Sequence[AssignmentBase] | AssignmentBase,
+    ):
+        if isinstance(assignments, AssignmentBase):
+            assignments = [assignments]
+
+        if not isinstance(assignments, AssignmentCollection):
+            assignments = AssignmentCollection(assignments)  # type: ignore
+
+        _ = _parse_simplification_hints(assignments)
+
+        analysis = KernelAnalysis(
+            self._ctx,
+            not self._cfg.skip_independence_check,
+            not self._cfg.allow_double_writes,
        )
+        analysis(assignments)

-    ctx.set_iteration_space(ispace)
+        if len(self._ctx.fields.index_fields) > 0 or self._cfg.index_field is not None:
+            ispace = create_sparse_iteration_space(
+                self._ctx, assignments, index_field=self._cfg.index_field
+            )
+        else:
+            ispace = create_full_iteration_space(
+                self._ctx,
+                assignments,
+                ghost_layers=self._cfg.ghost_layers,
+                iteration_slice=self._cfg.iteration_slice,
+            )

-    freeze = FreezeExpressions(ctx)
-    kernel_body = freeze(assignments)
+        self._ctx.set_iteration_space(ispace)

-    typify = Typifier(ctx)
-    kernel_body = typify(kernel_body)
+        freeze = FreezeExpressions(self._ctx)
+        kernel_body = freeze(assignments)

-    match config.target:
-        case Target.GenericCPU:
-            from .backend.platforms import GenericCpu
+        typify = Typifier(self._ctx)
+        kernel_body = typify(kernel_body)

-            platform = GenericCpu(ctx)
-            kernel_ast = platform.materialize_iteration_space(kernel_body, ispace)
+        match self._platform:
+            case GenericCpu():
+                kernel_ast = self._platform.materialize_iteration_space(
+                    kernel_body, ispace
+                )
+            case GenericGpu():
+                kernel_ast, gpu_threads = self._platform.materialize_iteration_space(
+                    kernel_body, ispace
+                )

-        case target if target.is_gpu():
-            match target:
-                case Target.SYCL:
-                    from .backend.platforms import SyclPlatform
+        #   Fold and extract constants
+        elim_constants = EliminateConstants(self._ctx, extract_constant_exprs=True)
+        kernel_ast = cast(PsBlock, elim_constants(kernel_ast))

-                    platform = SyclPlatform(ctx, config.gpu_indexing)
-                case Target.CUDA:
-                    from .backend.platforms import CudaPlatform
+        #   Target-Specific optimizations
+        if self._cfg.target.is_cpu():
+            kernel_ast = self._transform_for_cpu(kernel_ast)
+
+        #   Note: After this point, the AST may contain intrinsics, so type-dependent
+        #   transformations cannot be run any more
+
+        #   Lowering
+        lower_to_c = LowerToC(self._ctx)
+        kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
+
+        select_functions = SelectFunctions(self._platform)
+        kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+
+        #   Late canonicalization pass: Canonicalize new symbols introduced by LowerToC

-                    platform = CudaPlatform(ctx, config.gpu_indexing)
-                case _:
-                    raise NotImplementedError(
-                        f"Code generation for target {target} not implemented"
-                    )
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))

-            kernel_ast, gpu_threads = platform.materialize_iteration_space(
-                kernel_body, ispace
+        if self._cfg.target.is_cpu():
+            return create_cpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
            )
+        else:
+            return create_gpu_kernel_function(
+                self._ctx,
+                self._platform,
+                kernel_ast,
+                gpu_threads,
+                self._cfg.function_name,
+                self._cfg.target,
+                self._cfg.get_jit(),
+            )
+
+    def _transform_for_cpu(self, kernel_ast: PsBlock):
+        canonicalize = CanonicalizeSymbols(self._ctx, True)
+        kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+
+        hoist_invariants = HoistLoopInvariantDeclarations(self._ctx)
+        kernel_ast = cast(PsBlock, hoist_invariants(kernel_ast))
+
+        cpu_cfg = self._cfg.cpu_optim
+
+        if cpu_cfg is None:
+            return kernel_ast

-        case _:
-            raise NotImplementedError(
-                f"Code generation for target {target} not implemented"
+        if cpu_cfg.loop_blocking:
+            raise NotImplementedError("Loop blocking not implemented yet.")
+
+        kernel_ast = self._vectorize(kernel_ast)
+
+        if cpu_cfg.openmp is not False:
+            from .backend.transformations import AddOpenMP
+
+            params = (
+                cpu_cfg.openmp
+                if isinstance(cpu_cfg.openmp, OpenMpConfig)
+                else OpenMpConfig()
            )
+            add_omp = AddOpenMP(self._ctx, params)
+            kernel_ast = cast(PsBlock, add_omp(kernel_ast))
+
+        if cpu_cfg.use_cacheline_zeroing:
+            raise NotImplementedError("CL-zeroing not implemented yet")

-    #   Fold and extract constants
-    elim_constants = EliminateConstants(ctx, extract_constant_exprs=True)
-    kernel_ast = cast(PsBlock, elim_constants(kernel_ast))
+        return kernel_ast

-    #   Target-Specific optimizations
-    if config.target.is_cpu():
-        from .backend.kernelcreation import optimize_cpu
+    def _vectorize(self, kernel_ast: PsBlock) -> PsBlock:
+        assert self._cfg.cpu_optim is not None
+        vec_config = self._cfg.cpu_optim.get_vectorization_config()
+        if vec_config is None:
+            return kernel_ast

-        assert isinstance(platform, GenericCpu)
+        from .backend.transformations import LoopVectorizer, SelectIntrinsics

-        kernel_ast = optimize_cpu(ctx, platform, kernel_ast, config.cpu_optim)
+        assert isinstance(self._platform, GenericVectorCpu)

-    #   Lowering
-    lower_to_c = LowerToC(ctx)
-    kernel_ast = cast(PsBlock, lower_to_c(kernel_ast))
+        ispace = self._ctx.get_iteration_space()
+        if not isinstance(ispace, FullIterationSpace):
+            raise VectorizationError(
+                "Unable to vectorize kernel: The kernel is not using a dense iteration space."
+            )

-    select_functions = SelectFunctions(platform)
-    kernel_ast = cast(PsBlock, select_functions(kernel_ast))
+        inner_loop_coord = ispace.loop_order[-1]
+        inner_loop_dim = ispace.dimensions[inner_loop_coord]
+
+        #   Apply stride (TODO: and alignment) assumptions
+        if vec_config.assume_inner_stride_one:
+            for field in self._ctx.fields:
+                buf = self._ctx.get_buffer(field)
+                inner_stride = buf.strides[inner_loop_coord]
+                if isinstance(inner_stride, PsConstant):
+                    if inner_stride.value != 1:
+                        raise VectorizationError(
+                            f"Unable to apply assumption 'assume_inner_stride_one': "
+                            f"Field {field} has fixed stride {inner_stride} "
+                            f"set in the inner coordinate {inner_loop_coord}."
+                        )
+                else:
+                    buf.strides[inner_loop_coord] = PsConstant(1, buf.index_type)
+                    #   TODO: Communicate assumption to runtime system via a precondition
+
+        #   Call loop vectorizer
+        if vec_config.lanes is None:
+            lanes = VectorizationConfig.default_lanes(
+                self._target, cast(PsScalarType, self._ctx.default_dtype)
+            )
+        else:
+            lanes = vec_config.lanes
+
+        vectorizer = LoopVectorizer(self._ctx, lanes)
+
+        def loop_predicate(loop: PsLoop):
+            return loop.counter.symbol == inner_loop_dim.counter
+
+        kernel_ast = vectorizer.vectorize_select_loops(kernel_ast, loop_predicate)
+
+        select_intrin = SelectIntrinsics(self._ctx, self._platform)
+        kernel_ast = cast(PsBlock, select_intrin(kernel_ast))
+
+        return kernel_ast
+
+    def _get_platform(self) -> Platform:
+        if Target._CPU in self._target:
+            if Target._X86 in self._target:
+                from .backend.platforms.x86 import X86VectorArch, X86VectorCpu
+
+                arch: X86VectorArch
+
+                if Target._SSE in self._target:
+                    arch = X86VectorArch.SSE
+                elif Target._AVX in self._target:
+                    arch = X86VectorArch.AVX
+                elif Target._AVX512 in self._target:
+                    if Target._FP16 in self._target:
+                        arch = X86VectorArch.AVX512_FP16
+                    else:
+                        arch = X86VectorArch.AVX512
+                else:
+                    assert False, "unreachable code"
+
+                return X86VectorCpu(self._ctx, arch)
+            elif self._target == Target.GenericCPU:
+                return GenericCpu(self._ctx)
+            else:
+                raise NotImplementedError(
+                    f"No platform is currently available for CPU target {self._target}"
+                )
+
+        elif Target._GPU in self._target:
+            match self._target:
+                case Target.SYCL:
+                    from .backend.platforms import SyclPlatform

-    #   Late canonicalization and constant elimination passes
-    #    * Since lowering introduces new index calculations and indexing symbols into the AST,
-    #    * these need to be handled here
-    
-    canonicalize = CanonicalizeSymbols(ctx, True)
-    kernel_ast = cast(PsBlock, canonicalize(kernel_ast))
+                    return SyclPlatform(self._ctx, self._cfg.gpu_indexing)
+                case Target.CUDA:
+                    from .backend.platforms import CudaPlatform

-    late_fold_constants = EliminateConstants(ctx, extract_constant_exprs=False)
-    kernel_ast = cast(PsBlock, late_fold_constants(kernel_ast))
+                    return CudaPlatform(self._ctx, self._cfg.gpu_indexing)

-    if config.target.is_cpu():
-        return create_cpu_kernel_function(
-            ctx,
-            platform,
-            kernel_ast,
-            config.function_name,
-            config.target,
-            config.get_jit(),
-        )
-    else:
-        return create_gpu_kernel_function(
-            ctx,
-            platform,
-            kernel_ast,
-            gpu_threads,
-            config.function_name,
-            config.target,
-            config.get_jit(),
+        raise NotImplementedError(
+            f"Code generation for target {self._target} not implemented"
        )


@@ -192,6 +320,9 @@ def create_staggered_kernel(

 #   Internals

+
 def _parse_simplification_hints(ac: AssignmentCollection):
    if "split_groups" in ac.simplification_hints:
-        raise NotImplementedError("Loop splitting was requested, but is not implemented yet")
+        raise NotImplementedError(
+            "Loop splitting was requested, but is not implemented yet"
+        )
--- a/src/pystencils/py.typed
+++ b/src/pystencils/py.typed
--- a/src/pystencils/target.py
+++ b/src/pystencils/target.py
@@ -118,7 +118,7 @@ class Target(Flag):

    @staticmethod
    def available_vector_cpu_targets() -> list[Target]:
-        """Returns a list of available (vector) CPU targets, ordered from least to most capable."""
+        """Returns a list of available vector CPU targets, ordered from least to most capable."""
        return _available_vector_targets()



--- a/src/pystencils/types/meta.py
+++ b/src/pystencils/types/meta.py
@@ -138,7 +138,7 @@ class PsType(metaclass=PsTypeMeta):

    @property
    def itemsize(self) -> int | None:
-        """If this type has a valid in-memory size, return that size."""
+        """If this type has a valid in-memory size, return that size in bytes."""
        return None

    @property

--- a/src/pystencils/types/types.py
+++ b/src/pystencils/types/types.py
@@ -100,15 +100,19 @@ class PsPointerType(PsDereferencableType):

 class PsArrayType(PsDereferencableType):
    """Multidimensional array of fixed shape.
-    
+
    The element type of an array is never const; only the array itself can be.
    If ``element_type`` is const, its constness will be removed.
    """

    def __init__(
-        self, element_type: PsType, shape: SupportsIndex | Sequence[SupportsIndex], const: bool = False
+        self,
+        element_type: PsType,
+        shape: SupportsIndex | Sequence[SupportsIndex],
+        const: bool = False,
    ):
        from operator import index
+
        if isinstance(shape, SupportsIndex):
            shape = (index(shape),)
        else:
@@ -116,10 +120,10 @@ class PsArrayType(PsDereferencableType):

        if not shape or any(s <= 0 for s in shape):
            raise ValueError(f"Invalid array shape: {shape}")
-        
+
        if isinstance(element_type, PsArrayType):
            raise ValueError("Element type of array cannot be another array.")
-        
+
        element_type = deconstify(element_type)

        self._shape = shape
@@ -137,7 +141,7 @@ class PsArrayType(PsDereferencableType):
    def shape(self) -> tuple[int, ...]:
        """Shape of this array"""
        return self._shape
-    
+
    @property
    def dim(self) -> int:
        """Dimensionality of this array"""
@@ -396,12 +400,13 @@ class PsVectorType(PsNumericType):
        return np.dtype((self._scalar_type.numpy_dtype, (self._vector_entries,)))

    def create_constant(self, value: Any) -> Any:
-        if (
-            isinstance(value, np.ndarray)
-            and value.dtype == self.scalar_type.numpy_dtype
-            and value.shape == (self._vector_entries,)
-        ):
-            return value.copy()
+        if isinstance(value, np.ndarray):
+            if value.shape != (self._vector_entries,):
+                raise PsTypeError(
+                    f"Cannot create constant of vector type {self} from array of shape {value.shape}"
+                )
+            
+            return np.array([self._scalar_type.create_constant(v) for v in value])

        element = self._scalar_type.create_constant(value)
        return np.array(
@@ -552,7 +557,7 @@ class PsIntegerType(PsScalarType, ABC):

    def c_string(self) -> str:
        return f"{self._const_string()}{self._str_without_const()}_t"
-    
+
    def __str__(self) -> str:
        return f"{self._const_string()}{self._str_without_const()}"


--- a/tests/kernelcreation/test_domain_kernels.py
+++ b/tests/kernelcreation/test_domain_kernels.py
@@ -2,32 +2,84 @@ import pytest
 import sympy as sp
 import numpy as np

-from pystencils import fields, Field, AssignmentCollection, Target, CreateKernelConfig
+from dataclasses import replace
+
+from pystencils import (
+    fields,
+    Field,
+    AssignmentCollection,
+    Target,
+    CreateKernelConfig,
+    CpuOptimConfig,
+    VectorizationConfig,
+)
 from pystencils.assignment import assignment_from_stencil

-from pystencils.kernelcreation import create_kernel
+from pystencils.kernelcreation import create_kernel, KernelFunction
+from pystencils.backend.emission import emit_code

+AVAILABLE_TARGETS = [Target.GenericCPU]

-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
-def test_filter_kernel(target):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
+try:
+    import cupy
+
+    AVAILABLE_TARGETS += [Target.CUDA]
+except ImportError:
+    pass
+
+AVAILABLE_TARGETS += Target.available_vector_cpu_targets()
+TEST_IDS = [t.name for t in AVAILABLE_TARGETS]
+
+
+@pytest.fixture(params=AVAILABLE_TARGETS, ids=TEST_IDS)
+def gen_config(request):
+    target: Target = request.param
+
+    gen_config = CreateKernelConfig(target=target)
+
+    if Target._VECTOR in target:
+        gen_config = replace(
+            gen_config,
+            cpu_optim=CpuOptimConfig(
+                vectorize=VectorizationConfig(assume_inner_stride_one=True)
+            ),
+        )
+
+    return gen_config
+
+
+def inspect_dp_kernel(kernel: KernelFunction, gen_config: CreateKernelConfig):
+    code = emit_code(kernel)
+
+    match gen_config.target:
+        case Target.X86_SSE:
+            assert "_mm_loadu_pd" in code
+            assert "_mm_storeu_pd" in code
+        case Target.X86_AVX:
+            assert "_mm256_loadu_pd" in code
+            assert "_mm256_storeu_pd" in code
+        case Target.X86_AVX512:
+            assert "_mm512_loadu_pd" in code
+            assert "_mm512_storeu_pd" in code
+
+
+def test_filter_kernel(gen_config):
+    if gen_config.target == Target.CUDA:
+        import cupy as cp
+
+        xp = cp
    else:
        xp = np

    weight = sp.Symbol("weight")
-    stencil = [
-        [1, 1, 1],
-        [1, 1, 1],
-        [1, 1, 1]
-    ]
+    stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    src, dst = fields("src, dst: [2D]")
    asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
    asms = AssignmentCollection([asm])

-    gen_config = CreateKernelConfig(target=target)
    ast = create_kernel(asms, gen_config)
+    inspect_dp_kernel(ast, gen_config)
    kernel = ast.compile()

    src_arr = xp.ones((42, 31))
@@ -41,31 +93,28 @@ def test_filter_kernel(target):
    xp.testing.assert_allclose(dst_arr, expected)


-@pytest.mark.parametrize("target", (Target.GenericCPU, Target.CUDA))
-def test_filter_kernel_fixedsize(target):
-    if target == Target.CUDA:
-        xp = pytest.importorskip("cupy")
+def test_filter_kernel_fixedsize(gen_config):
+    if gen_config.target == Target.CUDA:
+        import cupy as cp
+
+        xp = cp
    else:
        xp = np

    weight = sp.Symbol("weight")
-    stencil = [
-        [1, 1, 1],
-        [1, 1, 1],
-        [1, 1, 1]
-    ]
+    stencil = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]

    src_arr = xp.ones((42, 31))
    dst_arr = xp.zeros_like(src_arr)

    src = Field.create_from_numpy_array("src", src_arr)
    dst = Field.create_from_numpy_array("dst", dst_arr)
-    
+
    asm = assignment_from_stencil(stencil, src, dst, normalization_factor=weight)
    asms = AssignmentCollection([asm])

-    gen_config = CreateKernelConfig(target=target)
    ast = create_kernel(asms, gen_config)
+    inspect_dp_kernel(ast, gen_config)
    kernel = ast.compile()

    kernel(src=src_arr, dst=dst_arr, weight=2.0)

--- a/tests/kernelcreation/test_sliced_iteration.py
+++ b/tests/kernelcreation/test_sliced_iteration.py
@@ -27,6 +27,3 @@ def test_sliced_iteration():
    expected_result = np.zeros(size)
    expected_result[1:x_end_value, 1] = 1
    np.testing.assert_almost_equal(expected_result, dst_arr)
-
-
-test_sliced_iteration()
--- a/tests/kernelcreation/test_spatial_counters.py
+++ b/tests/kernelcreation/test_spatial_counters.py
+import pytest
+import numpy as np
+
+from pystencils import (
+    Field,
+    Assignment,
+    create_kernel,
+    CreateKernelConfig,
+    DEFAULTS,
+    FieldType,
+)
+from pystencils.sympyextensions import CastFunc
+
+
+@pytest.mark.parametrize("index_dtype", ["int16", "int32", "uint32", "int64"])
+def test_spatial_counters_dense(index_dtype):
+    #   Parametrized over index_dtype to make sure the `DynamicType.INDEX` in the
+    #   DEFAULTS works validly
+    x, y, z = DEFAULTS.spatial_counters
+
+    f = Field.create_generic("f", 3, "float64", index_shape=(3,), layout="fzyx")
+
+    asms = [
+        Assignment(f(0), CastFunc.as_numeric(z)),
+        Assignment(f(1), CastFunc.as_numeric(y)),
+        Assignment(f(2), CastFunc.as_numeric(x)),
+    ]
+
+    cfg = CreateKernelConfig(index_dtype=index_dtype)
+    kernel = create_kernel(asms, cfg).compile()
+
+    f_arr = np.zeros((16, 16, 16, 3))
+    kernel(f=f_arr)
+
+    expected = np.mgrid[0:16, 0:16, 0:16].astype(np.float64).transpose()
+
+    np.testing.assert_equal(f_arr, expected)
+
+
+@pytest.mark.parametrize("index_dtype", ["int16", "int32", "uint32", "int64"])
+def test_spatial_counters_sparse(index_dtype):
+    x, y, z = DEFAULTS.spatial_counters
+
+    f = Field.create_generic("f", 3, "float64", index_shape=(3,), layout="fzyx")
+
+    asms = [
+        Assignment(f(0), CastFunc.as_numeric(x)),
+        Assignment(f(1), CastFunc.as_numeric(y)),
+        Assignment(f(2), CastFunc.as_numeric(z)),
+    ]
+
+    idx_struct = DEFAULTS.index_struct(index_dtype, 3)
+    idx_field = Field.create_generic(
+        "index", 1, idx_struct, field_type=FieldType.INDEXED
+    )
+
+    cfg = CreateKernelConfig(index_dtype=index_dtype, index_field=idx_field)
+    kernel = create_kernel(asms, cfg).compile()
+
+    f_arr = np.zeros((16, 16, 16, 3))
+    idx_arr = np.array(
+        [(1, 4, 3), (5, 1, 6), (9, 5, 1), (3, 13, 7)], dtype=idx_struct.numpy_dtype
+    )
+
+    kernel(f=f_arr, index=idx_arr)
+
+    for t in idx_arr:
+        assert f_arr[t[0], t[1], t[2], 0] == t[0].astype(np.float64)
+        assert f_arr[t[0], t[1], t[2], 1] == t[1].astype(np.float64)
+        assert f_arr[t[0], t[1], t[2], 2] == t[2].astype(np.float64)
--- a/tests/nbackend/test_vectorization.py
+++ b/tests/nbackend/test_vectorization.py
@@ -3,6 +3,8 @@ import sympy as sp
 import numpy as np
 from dataclasses import dataclass
 from itertools import chain
+from functools import partial
+from typing import Callable

 from pystencils.backend.kernelcreation import (
    KernelCreationContext,
@@ -28,64 +30,52 @@ from pystencils.types.quick import SInt, Fp

 @dataclass
 class VectorTestSetup:
-    platform: GenericVectorCpu
+    target: Target
+    platform_factory: Callable[[KernelCreationContext], GenericVectorCpu]
    lanes: int
    numeric_dtype: PsScalarType
    index_dtype: PsIntegerType

    @property
    def name(self) -> str:
-        if isinstance(self.platform, X86VectorCpu):
-            match self.platform.vector_arch:
-                case X86VectorArch.SSE:
-                    isa = "SSE"
-                case X86VectorArch.AVX:
-                    isa = "AVX"
-                case X86VectorArch.AVX512:
-                    isa = "AVX512"
-                case X86VectorArch.AVX512_FP16:
-                    isa = "AVX512_FP16"
-        else:
-            assert False
-
-        return f"{isa}/{self.numeric_dtype}<{self.lanes}>/{self.index_dtype}"
+        return f"{self.target.name}/{self.numeric_dtype}<{self.lanes}>/{self.index_dtype}"


 def get_setups(target: Target) -> list[VectorTestSetup]:
    match target:
        case Target.X86_SSE:
-            sse_platform = X86VectorCpu(X86VectorArch.SSE)
+            sse_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.SSE)
            return [
-                VectorTestSetup(sse_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(sse_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, sse_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, sse_platform, 2, Fp(64), SInt(64)),
            ]

        case Target.X86_AVX:
-            avx_platform = X86VectorCpu(X86VectorArch.AVX)
+            avx_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX)
            return [
-                VectorTestSetup(avx_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(avx_platform, 8, Fp(32), SInt(32)),
-                VectorTestSetup(avx_platform, 2, Fp(64), SInt(64)),
-                VectorTestSetup(avx_platform, 4, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx_platform, 8, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx_platform, 4, Fp(64), SInt(64)),
            ]

        case Target.X86_AVX512:
-            avx512_platform = X86VectorCpu(X86VectorArch.AVX512)
+            avx512_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX512)
            return [
-                VectorTestSetup(avx512_platform, 4, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 8, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 16, Fp(32), SInt(32)),
-                VectorTestSetup(avx512_platform, 2, Fp(64), SInt(64)),
-                VectorTestSetup(avx512_platform, 4, Fp(64), SInt(64)),
-                VectorTestSetup(avx512_platform, 8, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 4, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 16, Fp(32), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 2, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 4, Fp(64), SInt(64)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(64), SInt(64)),
            ]

        case Target.X86_AVX512_FP16:
-            avx512_platform = X86VectorCpu(X86VectorArch.AVX512_FP16)
+            avx512_platform = partial(X86VectorCpu, vector_arch=X86VectorArch.AVX512_FP16)
            return [
-                VectorTestSetup(avx512_platform, 8, Fp(16), SInt(32)),
-                VectorTestSetup(avx512_platform, 16, Fp(16), SInt(32)),
-                VectorTestSetup(avx512_platform, 32, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 8, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 16, Fp(16), SInt(32)),
+                VectorTestSetup(target, avx512_platform, 32, Fp(16), SInt(32)),
            ]

        case _:
@@ -108,6 +98,7 @@ def create_vector_kernel(
    ctx = KernelCreationContext(
        default_dtype=setup.numeric_dtype, index_dtype=setup.index_dtype
    )
+    platform = setup.platform_factory(ctx)

    factory = AstFactory(ctx)

@@ -129,7 +120,7 @@ def create_vector_kernel(
        loop_nest, lambda l: l.counter.symbol.name == "ctr_0"
    )

-    select_intrin = SelectIntrinsics(ctx, setup.platform)
+    select_intrin = SelectIntrinsics(ctx, platform)
    loop_nest = select_intrin(loop_nest)

    lower = LowerToC(ctx)
@@ -137,7 +128,7 @@ def create_vector_kernel(

    func = create_cpu_kernel_function(
        ctx,
-        setup.platform,
+        platform,
        PsBlock([loop_nest]),
        "vector_kernel",
        Target.CPU,
No results found