import abc
from typing import Tuple # noqa
import sympy as sp
from pystencils.astnodes import Conditional, Block
from pystencils.integer_functions import div_ceil
from pystencils.slicing import normalize_slice
from pystencils.data_types import TypedSymbol, create_type
from functools import partial
AUTO_BLOCK_SIZE_LIMITING = True
BLOCK_IDX = [TypedSymbol("blockIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
THREAD_IDX = [TypedSymbol("threadIdx." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
BLOCK_DIM = [TypedSymbol("blockDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
GRID_DIM = [TypedSymbol("gridDim." + coord, create_type("int")) for coord in ('x', 'y', 'z')]
class AbstractIndexing(abc.ABC):
"""
Abstract base class for all Indexing classes. An Indexing class defines how a multidimensional
field is mapped to CUDA's block and grid system. It calculates indices based on CUDA's thread and block indices
and computes the number of blocks and threads a kernel is started with. The Indexing class is created with
a pystencils field, a slice to iterate over, and further optional parameters that must have default values.
"""
@property
@abc.abstractmethod
def coordinates(self):
"""Returns a sequence of coordinate expressions for (x,y,z) depending on symbolic CUDA block and thread indices.
These symbolic indices can be obtained with the method `index_variables` """
@property
def index_variables(self):
"""Sympy symbols for CUDA's block and thread indices, and block and grid dimensions. """
return BLOCK_IDX + THREAD_IDX + BLOCK_DIM + GRID_DIM
@abc.abstractmethod
def call_parameters(self, arr_shape):
"""Determine grid and block size for kernel call.
Args:
arr_shape: the numeric (not symbolic) shape of the array
Returns:
            dict with keys 'block' and 'grid', with tuple values for the number of (x,y,z) threads per block
            and blocks per grid the kernel should be started with
"""
@abc.abstractmethod
def guard(self, kernel_content, arr_shape):
"""In some indexing schemes not all threads of a block execute the kernel content.
This function can return a Conditional ast node, defining this execution guard.
Args:
kernel_content: the actual kernel contents which can e.g. be put into the Conditional node as true block
arr_shape: the numeric or symbolic shape of the field
Returns:
ast node, which is put inside the kernel function
"""
# -------------------------------------------- Implementations ---------------------------------------------------------
class BlockIndexing(AbstractIndexing):
"""Generic indexing scheme that maps sub-blocks of an array to CUDA blocks.
Args:
field: pystencils field (common to all Indexing classes)
iteration_slice: slice that defines rectangular subarea which is iterated over
        permute_block_size_dependent_on_layout: if True the block_size is permuted such that the fastest coordinate
                                                gets the largest number of threads
compile_time_block_size: compile in concrete block size, otherwise the cuda variable 'blockDim' is used
"""
def __init__(self, field, iteration_slice=None,
block_size=(16, 16, 1), permute_block_size_dependent_on_layout=True, compile_time_block_size=False):
if field.spatial_dimensions > 3:
raise NotImplementedError("This indexing scheme supports at most 3 spatial dimensions")
if permute_block_size_dependent_on_layout:
block_size = self.permute_block_size_according_to_layout(block_size, field.layout)
if AUTO_BLOCK_SIZE_LIMITING:
block_size = self.limit_block_size_to_device_maximum(block_size)
self._block_size = block_size
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._dim = field.spatial_dimensions
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
self._compile_time_block_size = compile_time_block_size
@property
def coordinates(self):
offsets = _get_start_from_slice(self._iterationSlice)
block_size = self._block_size if self._compile_time_block_size else BLOCK_DIM
coordinates = [block_index * bs + thread_idx + off
for block_index, bs, thread_idx, off in zip(BLOCK_IDX, block_size, THREAD_IDX, offsets)]
return coordinates[:self._dim]
def call_parameters(self, arr_shape):
substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
_get_end_from_slice(self._iterationSlice, arr_shape))]
widths = sp.Matrix(widths).subs(substitution_dict)
extend_bs = (1,) * (3 - len(self._block_size))
block_size = self._block_size + extend_bs
if not self._compile_time_block_size:
block_size = tuple(sp.Min(bs, shape) for bs, shape in zip(block_size, widths)) + extend_bs
        grid = tuple(div_ceil(length, bs)
                     for length, bs in zip(widths, block_size))
extend_gr = (1,) * (3 - len(grid))
return {'block': block_size,
'grid': grid + extend_gr}
def guard(self, kernel_content, arr_shape):
arr_shape = arr_shape[:self._dim]
conditions = [c < end
for c, end in zip(self.coordinates, _get_end_from_slice(self._iterationSlice, arr_shape))]
condition = conditions[0]
for c in conditions[1:]:
condition = sp.And(condition, c)
return Block([Conditional(condition, kernel_content)])
@staticmethod
def limit_block_size_to_device_maximum(block_size):
"""Changes block size according to match device limits.
* if the total amount of threads is too big for the current device, the biggest coordinate is divided by 2.
* next, if one component is still too big, the component which is too big is divided by 2 and the smallest
component is multiplied by 2, such that the total amount of threads stays the same
Returns:
the altered block_size
"""
# Get device limits
import pycuda.driver as cuda
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
device = cuda.Context.get_device()
block_size = list(block_size)
max_threads = device.get_attribute(da.MAX_THREADS_PER_BLOCK)
max_block_size = [device.get_attribute(a)
for a in (da.MAX_BLOCK_DIM_X, da.MAX_BLOCK_DIM_Y, da.MAX_BLOCK_DIM_Z)]
def prod(seq):
result = 1
for e in seq:
result *= e
return result
def get_index_of_too_big_element():
for i, bs in enumerate(block_size):
if bs > max_block_size[i]:
return i
return None
def get_index_of_too_small_element():
for i, bs in enumerate(block_size):
                if bs * 2 <= max_block_size[i]:  # component can be doubled without exceeding its limit
return i
return None
# Reduce the total number of threads if necessary
while prod(block_size) > max_threads:
item_to_reduce = block_size.index(max(block_size))
for j, block_size_entry in enumerate(block_size):
if block_size_entry > max_block_size[j]:
item_to_reduce = j
block_size[item_to_reduce] //= 2
# Cap individual elements
too_big_element_index = get_index_of_too_big_element()
while too_big_element_index is not None:
too_small_element_index = get_index_of_too_small_element()
block_size[too_small_element_index] *= 2
block_size[too_big_element_index] //= 2
too_big_element_index = get_index_of_too_big_element()
return tuple(block_size)
@staticmethod
def limit_block_size_by_register_restriction(block_size, required_registers_per_thread, device=None):
"""Shrinks the block_size if there are too many registers used per multiprocessor.
This is not done automatically, since the required_registers_per_thread are not known before compilation.
They can be obtained by ``func.num_regs`` from a pycuda function.
:returns smaller block_size if too many registers are used.
"""
import pycuda.driver as cuda
# noinspection PyUnresolvedReferences
import pycuda.autoinit # NOQA
da = cuda.device_attribute
if device is None:
device = cuda.Context.get_device()
available_registers_per_mp = device.get_attribute(da.MAX_REGISTERS_PER_MULTIPROCESSOR)
        block = list(block_size)  # copy, so the in-place halving below also works for tuple arguments
while True:
num_threads = 1
for t in block:
num_threads *= t
required_registers_per_mt = num_threads * required_registers_per_thread
if required_registers_per_mt <= available_registers_per_mp:
return block
else:
largest_grid_entry_idx = max(range(len(block)), key=lambda e: block[e])
assert block[largest_grid_entry_idx] >= 2
block[largest_grid_entry_idx] //= 2
@staticmethod
def permute_block_size_according_to_layout(block_size, layout):
"""Returns modified block_size such that the fastest coordinate gets the biggest block dimension"""
sorted_block_size = list(sorted(block_size, reverse=True))
while len(sorted_block_size) > len(layout):
sorted_block_size[0] *= sorted_block_size[-1]
sorted_block_size = sorted_block_size[:-1]
result = list(block_size)
for l, bs in zip(reversed(layout), sorted_block_size):
result[l] = bs
return tuple(result[:len(layout)])
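# Illustrative sketch (not part of the original module; the helper name below is made up):
# for a c-layout 3D field the layout tuple is (0, 1, 2), i.e. coordinate 2 varies fastest
# and should therefore receive the largest block dimension.
def _example_permute_block_size():
    permuted = BlockIndexing.permute_block_size_according_to_layout((16, 16, 2), (0, 1, 2))
    assert permuted == (2, 16, 16)  # the fastest coordinate (index 2) got the biggest entry
    return permuted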
class LineIndexing(AbstractIndexing):
"""
Indexing scheme that assigns the innermost 'line' i.e. the elements which are adjacent in memory to a 1D CUDA block.
The fastest coordinate is indexed with thread_idx.x, the remaining coordinates are mapped to block_idx.{x,y,z}
This indexing scheme supports up to 4 spatial dimensions, where the innermost dimensions is not larger than the
maximum amount of threads allowed in a CUDA block (which depends on device).
"""
def __init__(self, field, iteration_slice=None):
available_indices = [THREAD_IDX[0]] + BLOCK_IDX
if field.spatial_dimensions > 4:
raise NotImplementedError("This indexing scheme supports at most 4 spatial dimensions")
coordinates = available_indices[:field.spatial_dimensions]
fastest_coordinate = field.layout[-1]
coordinates[0], coordinates[fastest_coordinate] = coordinates[fastest_coordinate], coordinates[0]
self._coordinates = coordinates
self._iterationSlice = normalize_slice(iteration_slice, field.spatial_shape)
self._symbolicShape = [e if isinstance(e, sp.Basic) else None for e in field.spatial_shape]
@property
def coordinates(self):
return [i + offset for i, offset in zip(self._coordinates, _get_start_from_slice(self._iterationSlice))]
def call_parameters(self, arr_shape):
substitution_dict = {sym: value for sym, value in zip(self._symbolicShape, arr_shape) if sym is not None}
widths = [end - start for start, end in zip(_get_start_from_slice(self._iterationSlice),
_get_end_from_slice(self._iterationSlice, arr_shape))]
widths = sp.Matrix(widths).subs(substitution_dict)
def get_shape_of_cuda_idx(cuda_idx):
if cuda_idx not in self._coordinates:
return 1
else:
idx = self._coordinates.index(cuda_idx)
return widths[idx]
return {'block': tuple([get_shape_of_cuda_idx(idx) for idx in THREAD_IDX]),
'grid': tuple([get_shape_of_cuda_idx(idx) for idx in BLOCK_IDX])}
def guard(self, kernel_content, arr_shape):
return kernel_content
# -------------------------------------- Helper functions --------------------------------------------------------------
def _get_start_from_slice(iteration_slice):
res = []
for slice_component in iteration_slice:
if type(slice_component) is slice:
res.append(slice_component.start if slice_component.start is not None else 0)
else:
assert isinstance(slice_component, int)
res.append(slice_component)
return res
def _get_end_from_slice(iteration_slice, arr_shape):
iter_slice = normalize_slice(iteration_slice, arr_shape)
res = []
for slice_component in iter_slice:
if type(slice_component) is slice:
res.append(slice_component.stop)
else:
assert isinstance(slice_component, int)
res.append(slice_component + 1)
return res
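# Behaviour sketch of the two slice helpers above (illustrative only):
def _example_slice_helpers():
    iteration_slice = (slice(1, 10), 3)                     # a slice plus a fixed coordinate
    starts = _get_start_from_slice(iteration_slice)         # -> [1, 3]
    ends = _get_end_from_slice(iteration_slice, (12, 12))   # -> [10, 4]
    return starts, ends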
def indexing_creator_from_params(gpu_indexing, gpu_indexing_params):
if isinstance(gpu_indexing, str):
if gpu_indexing == 'block':
indexing_creator = BlockIndexing
elif gpu_indexing == 'line':
indexing_creator = LineIndexing
else:
raise ValueError("Unknown GPU indexing %s. Valid values are 'block' and 'line'" % (gpu_indexing,))
if gpu_indexing_params:
indexing_creator = partial(indexing_creator, **gpu_indexing_params)
return indexing_creator
else:
return gpu_indexing
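# Minimal sketch of how the factory above is typically used (assumed workflow;
# the function name below is illustrative):
def _example_indexing_creator():
    creator = indexing_creator_from_params('block', {'block_size': (32, 4, 1)})
    # `creator` behaves like BlockIndexing with block_size preset; it still has to be
    # called with a field and an iteration slice to obtain an indexing instance.
    return creator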
from .kerncraft_interface import PyStencilsKerncraftKernel, KerncraftParameters
__all__ = ['PyStencilsKerncraftKernel', 'KerncraftParameters']
from jinja2 import Template
from pystencils.backends.cbackend import generate_c
from pystencils.sympyextensions import prod
from pystencils.data_types import get_base_type
benchmark_template = Template("""
#include "kerncraft.h"
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <math.h>
{%- if likwid %}
#include <likwid.h>
{%- endif %}
#define RESTRICT __restrict__
#define FUNC_PREFIX
void dummy(double *);
extern int var_false;
{{kernel_code}}
int main(int argc, char **argv)
{
{%- if likwid %}
likwid_markerInit();
likwid_markerThreadInit();
{%- endif %}
{%- for field_name, dataType, size in fields %}
// Initialization {{field_name}}
    {{dataType}} * {{field_name}} = ({{dataType}} *) aligned_malloc(sizeof({{dataType}}) * {{size}}, 32);
for (unsigned long long i = 0; i < {{size}}; ++i)
{{field_name}}[i] = 0.23;
if(var_false)
dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
// Constant {{constantName}}
{{dataType}} {{constantName}};
{{constantName}} = 0.23;
if(var_false)
dummy(& {{constantName}});
{%- endfor %}
int repeat = atoi(argv[1]);
{%- if likwid %}
likwid_markerStartRegion("loop");
{%- endif %}
for (; repeat > 0; --repeat)
{
{{kernelName}}({{call_argument_list}});
// Dummy calls
{%- for field_name, dataType, size in fields %}
if(var_false) dummy({{field_name}});
{%- endfor %}
{%- for constantName, dataType in constants %}
if(var_false) dummy(&{{constantName}});
{%- endfor %}
}
{%- if likwid %}
likwid_markerStopRegion("loop");
{%- endif %}
{%- if likwid %}
likwid_markerClose();
{%- endif %}
}
""")
def generate_benchmark(ast, likwid=False):
accessed_fields = {f.name: f for f in ast.fields_accessed}
constants = []
fields = []
call_parameters = []
for p in ast.parameters:
if not p.is_field_argument:
constants.append((p.name, str(p.dtype)))
call_parameters.append(p.name)
else:
assert p.is_field_ptr_argument, "Benchmark implemented only for kernels with fixed loop size"
field = accessed_fields[p.field_name]
dtype = str(get_base_type(p.dtype))
fields.append((p.field_name, dtype, prod(field.shape)))
call_parameters.append(p.field_name)
args = {
'likwid': likwid,
'kernel_code': generate_c(ast),
'kernelName': ast.function_name,
'fields': fields,
'constants': constants,
'call_argument_list': ",".join(call_parameters),
}
return benchmark_template.render(**args)
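# Hedged usage sketch: render the benchmark harness for a trivial CPU kernel.
# Fixed-size fields are required (see the assertion above); all names are illustrative.
def _example_generate_benchmark():
    import numpy as np
    import pystencils as ps
    src, dst = ps.fields(src=np.zeros((20, 20)), dst=np.zeros((20, 20)))
    ast = ps.create_kernel(ps.Assignment(dst[0, 0], 2 * src[0, 0]))
    return generate_benchmark(ast, likwid=False)  # the C source of the benchmark as a string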
from tempfile import TemporaryDirectory
import sympy as sp
import os
from collections import defaultdict
import subprocess
import kerncraft
import kerncraft.kernel
from kerncraft.iaca import iaca_analyse_instrumented_binary, iaca_instrumentation
from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark
from pystencils.astnodes import LoopOverCoordinate, SympyAssignment, ResolvedFieldAccess
from pystencils.field import get_layout_from_strides
from pystencils.sympyextensions import count_operations_in_ast
from pystencils.utils import DotDict
class PyStencilsKerncraftKernel(kerncraft.kernel.Kernel):
"""
Implementation of kerncraft's kernel interface for pystencils CPU kernels.
Analyses a list of equations assuming they will be executed on a CPU
"""
LIKWID_BASE = '/usr/local/likwid'
def __init__(self, ast, machine=None):
super(PyStencilsKerncraftKernel, self).__init__(machine)
self.ast = ast
self.temporary_dir = TemporaryDirectory()
# Loops
inner_loops = [l for l in ast.atoms(LoopOverCoordinate) if l.is_innermost_loop]
if len(inner_loops) == 0:
raise ValueError("No loop found in pystencils AST")
elif len(inner_loops) > 1:
raise ValueError("pystencils AST contains multiple inner loops - only one can be analyzed")
else:
inner_loop = inner_loops[0]
self._loop_stack = []
cur_node = inner_loop
while cur_node is not None:
if isinstance(cur_node, LoopOverCoordinate):
loop_counter_sym = cur_node.loop_counter_symbol
loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, cur_node.step)
self._loop_stack.append(loop_info)
cur_node = cur_node.parent
self._loop_stack = list(reversed(self._loop_stack))
# Data sources & destinations
self.sources = defaultdict(list)
self.destinations = defaultdict(list)
reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
for fa in accesses:
coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
for i, off in enumerate(fa.offsets)]
coord += list(fa.idx_coordinate_values)
layout = get_layout_from_strides(fa.field.strides)
permuted_coord = [coord[i] for i in layout]
target_dict[fa.field.name].append(permuted_coord)
# Variables (arrays)
fields_accessed = ast.fields_accessed
for field in fields_accessed:
layout = get_layout_from_strides(field.strides)
permuted_shape = list(field.shape[i] for i in layout)
self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))
for param in ast.parameters:
if not param.is_field_argument:
self.set_variable(param.name, str(param.dtype), None)
self.sources[param.name] = [None]
# data type
self.datatype = list(self.variables.values())[0][0]
# flops
operation_count = count_operations_in_ast(inner_loop)
self._flops = {
'+': operation_count['adds'],
'*': operation_count['muls'],
'/': operation_count['divs'],
}
for k in [k for k, v in self._flops.items() if v == 0]:
del self._flops[k]
self.check()
def iaca_analysis(self, micro_architecture, asm_block='auto',
pointer_increment='auto_with_manual_fallback', verbose=False):
compiler, compiler_args = self._machine.get_compiler()
if '-std=c99' not in compiler_args:
compiler_args += ['-std=c99']
header_path = kerncraft.get_header_path()
compiler_cmd = [compiler] + compiler_args + ['-I' + header_path]
src_file = os.path.join(self.temporary_dir.name, "source.c")
asm_file = os.path.join(self.temporary_dir.name, "source.s")
iaca_asm_file = os.path.join(self.temporary_dir.name, "source.iaca.s")
dummy_src_file = os.path.join(header_path, "dummy.c")
dummy_asm_file = os.path.join(self.temporary_dir.name, "dummy.s")
binary_file = os.path.join(self.temporary_dir.name, "binary")
# write source code to file
with open(src_file, 'w') as f:
f.write(generate_benchmark(self.ast, likwid=False))
# compile to asm files
subprocess.check_output(compiler_cmd + [src_file, '-S', '-o', asm_file])
subprocess.check_output(compiler_cmd + [dummy_src_file, '-S', '-o', dummy_asm_file])
with open(asm_file) as read, open(iaca_asm_file, 'w') as write:
instrumented_asm_block = iaca_instrumentation(read, write)
# assemble asm files to executable
subprocess.check_output(compiler_cmd + [iaca_asm_file, dummy_asm_file, '-o', binary_file])
result = iaca_analyse_instrumented_binary(binary_file, micro_architecture)
return result, instrumented_asm_block
def build(self, lflags=None, verbose=False):
compiler, compiler_args = self._machine.get_compiler()
if '-std=c99' not in compiler_args:
compiler_args.append('-std=c99')
header_path = kerncraft.get_header_path()
cmd = [compiler] + compiler_args + [
'-I' + os.path.join(self.LIKWID_BASE, 'include'),
'-L' + os.path.join(self.LIKWID_BASE, 'lib'),
'-I' + header_path,
'-Wl,-rpath=' + os.path.join(self.LIKWID_BASE, 'lib'),
]
dummy_src_file = os.path.join(header_path, 'dummy.c')
src_file = os.path.join(self.temporary_dir.name, "source_likwid.c")
bin_file = os.path.join(self.temporary_dir.name, "benchmark")
with open(src_file, 'w') as f:
f.write(generate_benchmark(self.ast, likwid=True))
subprocess.check_output(cmd + [src_file, dummy_src_file, '-pthread', '-llikwid', '-o', bin_file])
return bin_file
class KerncraftParameters(DotDict):
def __init__(self, **kwargs):
super(KerncraftParameters, self).__init__(**kwargs)
self['asm_block'] = 'auto'
self['asm_increment'] = 0
self['cores'] = 1
self['cache_predictor'] = 'SIM'
self['verbose'] = 0
self['pointer_increment'] = 'auto'
self['iterations'] = 10
# ------------------------------------------- Helper functions ---------------------------------------------------------
def search_resolved_field_accesses_in_ast(ast):
def visit(node, reads, writes):
if not isinstance(node, SympyAssignment):
for a in node.args:
visit(a, reads, writes)
return
for expr, accesses in [(node.lhs, writes), (node.rhs, reads)]:
accesses.update(expr.atoms(ResolvedFieldAccess))
read_accesses = set()
write_accesses = set()
visit(ast, read_accesses, write_accesses)
return read_accesses, write_accesses
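# Small sketch of the AST search helper above (illustrative; assumes the CPU backend,
# which resolves field accesses into ResolvedFieldAccess nodes):
def _example_search_accesses():
    import numpy as np
    import pystencils as ps
    src, dst = ps.fields(src=np.zeros((10, 10)), dst=np.zeros((10, 10)))
    ast = ps.create_kernel(ps.Assignment(dst[0, 0], src[0, 1] + src[0, -1]))
    reads, writes = search_resolved_field_accesses_in_ast(ast)
    return reads, writes  # sets of ResolvedFieldAccess nodes from the rhs and lhs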
from types import MappingProxyType
import sympy as sp
from pystencils.assignment import Assignment
from pystencils.astnodes import LoopOverCoordinate, Conditional, Block, SympyAssignment
from pystencils.cpu.vectorization import vectorize
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.gpucuda.indexing import indexing_creator_from_params
from pystencils.transformations import remove_conditionals_in_staggered_kernel
def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
skip_independence_check=False,
cpu_openmp=False, cpu_vectorize_info=None,
gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
Creates abstract syntax tree (AST) of kernel, using a list of update equations.
Args:
assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
target: 'cpu', 'llvm' or 'gpu'
data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
to type
iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
part of the field is iterated over
ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
a single integer specifies the ghost layer count at all borders, can also be a sequence of
pairs ``[(x_lower_gl, x_upper_gl), .... ]``
        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
                                 periodicity kernels, which access the field outside the iteration bounds.
                                 Use with care!
        cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
        cpu_vectorize_info: a dictionary with keys 'instruction_set', 'assume_aligned' and 'nontemporal';
                            for documentation of these parameters see the vectorize function. Example:
                            ``{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal': True}``
        gpu_indexing: either 'block' or 'line', or a custom indexing class, see `AbstractIndexing`
        gpu_indexing_params: dict with indexing parameters (constructor parameters of the indexing class),
                             e.g. for 'block' one can specify ``{'block_size': (20, 20, 10)}``
    Returns:
        abstract syntax tree (AST) object that can either be printed as source code with `show_code` or
        can be compiled through its ``compile()`` member
Example:
>>> import pystencils as ps
>>> import numpy as np
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
>>> ast = ps.create_kernel(assignment, target='cpu', cpu_openmp=True)
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(d=d_arr, s=np.ones([5, 5]))
>>> d_arr
array([[0., 0., 0., 0., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 4., 4., 4., 0.],
[0., 0., 0., 0., 0.]])
"""
# ---- Normalizing parameters
split_groups = ()
if isinstance(assignments, AssignmentCollection):
if 'split_groups' in assignments.simplification_hints:
split_groups = assignments.simplification_hints['split_groups']
assignments = assignments.all_assignments
if isinstance(assignments, Assignment):
assignments = [assignments]
# ---- Creating ast
if target == 'cpu':
from pystencils.cpu import create_kernel
from pystencils.cpu import add_openmp
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp)
if cpu_vectorize_info:
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
else:
raise ValueError("Invalid value for cpu_vectorize_info")
return ast
elif target == 'llvm':
from pystencils.llvm import create_kernel
ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
iteration_slice=iteration_slice, ghost_layers=ghost_layers)
return ast
elif target == 'gpu':
from pystencils.gpucuda import create_cuda_kernel
ast = create_cuda_kernel(assignments, type_info=data_type,
indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
iteration_slice=iteration_slice, ghost_layers=ghost_layers,
skip_independence_check=skip_independence_check)
return ast
else:
raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
def create_indexed_kernel(assignments, index_fields, target='cpu', data_type="double", coordinate_names=('x', 'y', 'z'),
cpu_openmp=True, gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
"""
    Similar to :func:`create_kernel`, but here not all cells of a field are updated, only the cells whose
    coordinates are stored in an index field. This traversal method can e.g. be used for boundary handling.
    The coordinates are stored in a separate index_field, which is a one-dimensional array with struct data type.
    This struct has to contain fields named 'x', 'y' and, for 3D fields, 'z'. These names are configurable via the
    'coordinate_names' parameter. The struct can also have other fields that can be read and written in the kernel,
    for example boundary parameters.
    Args:
        index_fields: list of index fields, i.e. 1D fields with struct data type
        coordinate_names: names of the coordinate fields in the struct data type
Example:
>>> import pystencils as ps
>>> import numpy as np
>>>
>>> # Index field stores the indices of the cell to visit together with optional values
>>> index_arr_dtype = np.dtype([('x', np.int32), ('y', np.int32), ('val', np.double)])
>>> index_arr = np.array([(1, 1, 0.1), (2, 2, 0.2), (3, 3, 0.3)], dtype=index_arr_dtype)
>>> idx_field = ps.fields(idx=index_arr)
>>>
>>> # Additional values stored in index field can be accessed in the kernel as well
>>> s, d = ps.fields('s, d: [2D]')
>>> assignment = ps.Assignment(d[0,0], 2 * s[0, 1] + 2 * s[1, 0] + idx_field('val'))
>>> ast = create_indexed_kernel(assignment, [idx_field], coordinate_names=('x', 'y'))
>>> kernel = ast.compile()
>>> d_arr = np.zeros([5, 5])
>>> kernel(s=np.ones([5, 5]), d=d_arr, idx=index_arr)
>>> d_arr
array([[0. , 0. , 0. , 0. , 0. ],
[0. , 4.1, 0. , 0. , 0. ],
[0. , 0. , 4.2, 0. , 0. ],
[0. , 0. , 0. , 4.3, 0. ],
[0. , 0. , 0. , 0. , 0. ]])
"""
if isinstance(assignments, Assignment):
assignments = [assignments]
elif isinstance(assignments, AssignmentCollection):
assignments = assignments.all_assignments
if target == 'cpu':
from pystencils.cpu import create_indexed_kernel
from pystencils.cpu import add_openmp
ast = create_indexed_kernel(assignments, index_fields=index_fields, type_info=data_type,
coordinate_names=coordinate_names)
if cpu_openmp:
add_openmp(ast, num_threads=cpu_openmp)
return ast
elif target == 'llvm':
raise NotImplementedError("Indexed kernels are not yet supported in LLVM backend")
elif target == 'gpu':
from pystencils.gpucuda import created_indexed_cuda_kernel
idx_creator = indexing_creator_from_params(gpu_indexing, gpu_indexing_params)
ast = created_indexed_cuda_kernel(assignments, index_fields, type_info=data_type,
coordinate_names=coordinate_names, indexing_creator=idx_creator)
return ast
else:
raise ValueError("Unknown target %s. Has to be either 'cpu' or 'gpu'" % (target,))
def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu', **kwargs):
"""Kernel that updates a staggered field.
.. image:: /img/staggered_grid.svg
Args:
staggered_field: field where the first index coordinate defines the location of the staggered value
can have 1 or 2 index coordinates, in case of of two index coordinates at every staggered location
a vector is stored, expressions has to be a sequence of sequences then
where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
boundary and ``f[0,0](1)`` the southern cell boundary etc.
expressions: sequence of expressions of length dim, defining how the east, southern, (bottom) cell boundary
should be updated.
subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
target: 'cpu' or 'gpu'
kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed
Returns:
AST, see `create_kernel`
"""
assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
dim = staggered_field.spatial_dimensions
counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
assert len(expressions) == dim
if staggered_field.index_dimensions == 2:
assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
"If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
"same length."
final_assignments = []
for d in range(dim):
cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
if staggered_field.index_dimensions == 1:
assignments = [Assignment(staggered_field(d), expressions[d])]
a_coll = AssignmentCollection(assignments, list(subexpressions)).new_filtered([staggered_field(d)])
elif staggered_field.index_dimensions == 2:
assert staggered_field.has_fixed_index_shape
assignments = [Assignment(staggered_field(d, i), expr) for i, expr in enumerate(expressions[d])]
a_coll = AssignmentCollection(assignments, list(subexpressions))
a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])])
sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
final_assignments.append(Conditional(cond, Block(sp_assignments)))
ghost_layers = [(1, 0)] * dim
cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
if cpu_vectorize_info:
del kwargs['cpu_vectorize_info']
ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
if target == 'cpu':
remove_conditionals_in_staggered_kernel(ast)
if cpu_vectorize_info is True:
vectorize(ast)
elif isinstance(cpu_vectorize_info, dict):
vectorize(ast, **cpu_vectorize_info)
return ast
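# Hedged example for create_staggered_kernel: a 2D staggered field with one index
# dimension holding one value per cell face. The field names and the interpolation
# expressions below are illustrative only.
def _example_staggered_kernel():
    import pystencils as ps
    v, c = ps.fields('v(2), c : double[2D]')
    expressions = [(c[0, 0] + c[-1, 0]) / 2,   # face value in x direction
                   (c[0, 0] + c[0, -1]) / 2]   # face value in y direction
    return create_staggered_kernel(v, expressions, target='cpu')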
from .kernelcreation import create_kernel
from .llvmjit import make_python_function
__all__ = ['create_kernel', 'make_python_function']
import llvmlite.ir as ir
class Loop(object):
def __init__(self, builder, start_val, stop_val, step_val=1, loop_name='loop', phi_name="_phi"):
self.builder = builder
self.start_val = start_val
self.stop_val = stop_val
self.step_val = step_val
self.loop_name = loop_name
self.phi_name = phi_name
def __enter__(self):
self.loop_end, self.after, phi = self._for_loop(self.start_val, self.stop_val, self.step_val, self.loop_name,
self.phi_name)
return phi
def _for_loop(self, start_val, stop_val, step_val, loop_name, phi_name):
        # TODO: size of int? unsigned?
integer = ir.IntType(64)
# Loop block
pre_loop_bb = self.builder.block
loop_bb = self.builder.append_basic_block(name='loop_' + loop_name)
self.builder.branch(loop_bb)
# Insert an explicit fall through from the current block to loop_bb
self.builder.position_at_start(loop_bb)
# Add phi
phi = self.builder.phi(integer, name=phi_name)
phi.add_incoming(start_val, pre_loop_bb)
loop_end_bb = self.builder.append_basic_block(name=loop_name + "_end_bb")
self.builder.position_at_start(loop_end_bb)
next_var = self.builder.add(phi, step_val, name=loop_name + '_next_it')
cond = self.builder.icmp_unsigned('<', next_var, stop_val, name=loop_name + "_cond")
after_bb = self.builder.append_basic_block(name=loop_name + "_after_bb")
self.builder.cbranch(cond, loop_bb, after_bb)
phi.add_incoming(next_var, loop_end_bb)
self.builder.position_at_end(loop_bb)
return loop_end_bb, after_bb, phi
def __exit__(self, exc_type, exc, exc_tb):
self.builder.branch(self.loop_end)
self.builder.position_at_end(self.after)
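# Sketch of the context manager above: emit a bare counted loop into a fresh
# llvmlite function (illustrative; the function and module names are made up).
def _example_loop():
    module = ir.Module()
    fn = ir.Function(module, ir.FunctionType(ir.VoidType(), []), name='loop_demo')
    builder = ir.IRBuilder(fn.append_basic_block('entry'))
    i64 = ir.IntType(64)
    start, stop, step = (ir.Constant(i64, v) for v in (0, 10, 1))
    with Loop(builder, start, stop, step, 'i', 'i_phi') as i:
        pass  # the loop body would be emitted here, using the phi value `i`
    builder.ret_void()
    return module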
from pystencils.transformations import insert_casts
from functools import partial
from pystencils.llvm.llvmjit import make_python_function
def create_kernel(assignments, function_name="kernel", type_info=None, split_groups=(),
iteration_slice=None, ghost_layers=None):
"""
Creates an abstract syntax tree for a kernel function, by taking a list of update rules.
Loops are created according to the field accesses in the equations.
:param assignments: list of sympy equations, containing accesses to :class:`pystencils.field.Field`.
Defining the update rules of the kernel
:param function_name: name of the generated function - only important if generated code is written out
:param type_info: a map from symbol name to a C type specifier. If not specified all symbols are assumed to
be of type 'double' except symbols which occur on the left hand side of equations where the
right hand side is a sympy Boolean which are assumed to be 'bool' .
:param split_groups: Specification on how to split up inner loop into multiple loops. For details see
transformation :func:`pystencils.transformation.split_inner_loop`
:param iteration_slice: if not None, iteration is done only over this slice of the field
:param ghost_layers: a sequence of pairs for each coordinate with lower and upper nr of ghost layers
if None, the number of ghost layers is determined automatically and assumed to be equal for a
all dimensions
:return: :class:`pystencils.ast.KernelFunction` node
"""
from pystencils.cpu import create_kernel
code = create_kernel(assignments, function_name, type_info, split_groups, iteration_slice, ghost_layers)
code = insert_casts(code)
code.compile = partial(make_python_function, code)
return code
import sympy as sp
import functools
from sympy import S, Indexed
from sympy.printing.printer import Printer
import llvmlite.ir as ir
from pystencils.assignment import Assignment
from pystencils.llvm.control_flow import Loop
from pystencils.data_types import create_type, to_llvm_type, get_type_of_expression, collate_types, \
create_composite_type_from_string
def generate_llvm(ast_node, module=None, builder=None):
"""Prints the ast as llvm code."""
if module is None:
module = ir.Module()
if builder is None:
builder = ir.IRBuilder()
printer = LLVMPrinter(module, builder)
return printer._print(ast_node)
# noinspection PyPep8Naming
class LLVMPrinter(Printer):
"""Convert expressions to LLVM IR"""
def __init__(self, module, builder, fn=None, *args, **kwargs):
self.func_arg_map = kwargs.pop("func_arg_map", {})
super(LLVMPrinter, self).__init__(*args, **kwargs)
self.fp_type = ir.DoubleType()
self.fp_pointer = self.fp_type.as_pointer()
self.integer = ir.IntType(64)
self.integer_pointer = self.integer.as_pointer()
self.void = ir.VoidType()
self.module = module
self.builder = builder
self.fn = fn
self.ext_fn = {} # keep track of wrappers to external functions
self.tmp_var = {}
def _add_tmp_var(self, name, value):
self.tmp_var[name] = value
def _remove_tmp_var(self, name):
del self.tmp_var[name]
def _print_Number(self, n):
if get_type_of_expression(n) == create_type("int"):
return ir.Constant(self.integer, int(n))
elif get_type_of_expression(n) == create_type("double"):
return ir.Constant(self.fp_type, float(n))
else:
raise NotImplementedError("Numbers can only have int and double", n)
def _print_Float(self, expr):
return ir.Constant(self.fp_type, float(expr))
def _print_Integer(self, expr):
return ir.Constant(self.integer, int(expr))
def _print_int(self, i):
return ir.Constant(self.integer, i)
def _print_Symbol(self, s):
val = self.tmp_var.get(s)
if not val:
# look up parameter with name s
val = self.func_arg_map.get(s.name)
if not val:
raise LookupError("Symbol not found: %s" % s)
return val
def _print_Pow(self, expr):
base0 = self._print(expr.base)
if expr.exp == S.NegativeOne:
return self.builder.fdiv(ir.Constant(self.fp_type, 1.0), base0)
if expr.exp == S.Half:
fn = self.ext_fn.get("sqrt")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, "sqrt")
self.ext_fn["sqrt"] = fn
return self.builder.call(fn, [base0], "sqrt")
if expr.exp == 2:
return self.builder.fmul(base0, base0)
elif expr.exp == 3:
return self.builder.fmul(self.builder.fmul(base0, base0), base0)
exp0 = self._print(expr.exp)
fn = self.ext_fn.get("pow")
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type, self.fp_type])
fn = ir.Function(self.module, fn_type, "pow")
self.ext_fn["pow"] = fn
return self.builder.call(fn, [base0, exp0], "pow")
def _print_Mul(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
mul = self.builder.fmul
else: # int TODO unsigned/signed
mul = self.builder.mul
for node in nodes[1:]:
e = mul(e, node)
return e
def _print_Add(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
if get_type_of_expression(expr) == create_type('double'):
add = self.builder.fadd
else: # int TODO unsigned/signed
add = self.builder.add
for node in nodes[1:]:
e = add(e, node)
return e
def _print_Or(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.or_(e, node)
return e
def _print_And(self, expr):
nodes = [self._print(a) for a in expr.args]
e = nodes[0]
for node in nodes[1:]:
e = self.builder.and_(e, node)
return e
def _print_StrictLessThan(self, expr):
return self._comparison('<', expr)
def _print_LessThan(self, expr):
return self._comparison('<=', expr)
def _print_StrictGreaterThan(self, expr):
return self._comparison('>', expr)
def _print_GreaterThan(self, expr):
return self._comparison('>=', expr)
def _print_Unequality(self, expr):
return self._comparison('!=', expr)
def _print_Equality(self, expr):
return self._comparison('==', expr)
def _comparison(self, cmpop, expr):
if collate_types([get_type_of_expression(arg) for arg in expr.args]) == create_type('double'):
comparison = self.builder.fcmp_unordered
else:
comparison = self.builder.icmp_signed
return comparison(cmpop, self._print(expr.lhs), self._print(expr.rhs))
def _print_KernelFunction(self, func):
        # KernelFunction does not possess a return type
return_type = self.void
parameter_type = []
for parameter in func.parameters:
parameter_type.append(to_llvm_type(parameter.dtype))
func_type = ir.FunctionType(return_type, tuple(parameter_type))
name = func.function_name
fn = ir.Function(self.module, func_type, name)
self.ext_fn[name] = fn
# set proper names to arguments
for i, arg in enumerate(fn.args):
arg.name = func.parameters[i].name
self.func_arg_map[func.parameters[i].name] = arg
# func.attributes.add("inlinehint")
# func.attributes.add("argmemonly")
block = fn.append_basic_block(name="entry")
self.builder = ir.IRBuilder(block) # TODO use goto_block instead
self._print(func.body)
self.builder.ret_void()
self.fn = fn
return fn
def _print_Block(self, block):
for node in block.args:
self._print(node)
def _print_LoopOverCoordinate(self, loop):
with Loop(self.builder, self._print(loop.start), self._print(loop.stop), self._print(loop.step),
loop.loop_counter_name, loop.loop_counter_symbol.name) as i:
self._add_tmp_var(loop.loop_counter_symbol, i)
self._print(loop.body)
self._remove_tmp_var(loop.loop_counter_symbol)
def _print_SympyAssignment(self, assignment):
expr = self._print(assignment.rhs)
lhs = assignment.lhs
if isinstance(lhs, Indexed):
ptr = self._print(lhs.base.label)
index = self._print(lhs.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.store(expr, gep)
self.func_arg_map[assignment.lhs.name] = expr
return expr
def _print_cast_func(self, conversion):
node = self._print(conversion.args[0])
to_dtype = get_type_of_expression(conversion)
from_dtype = get_type_of_expression(conversion.args[0])
if from_dtype == to_dtype:
return self._print(conversion.args[0])
# (From, to)
decision = {
(create_composite_type_from_string("int16"),
create_composite_type_from_string("int64")): lambda: ir.Constant(self.integer, node),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("int16"),
create_composite_type_from_string("double")): functools.partial(self.builder.sitofp, node, self.fp_type),
(create_composite_type_from_string("double"),
create_composite_type_from_string("int")): functools.partial(self.builder.fptosi, node, self.integer),
(create_composite_type_from_string("double *"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double *")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
(create_composite_type_from_string("double * restrict"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node, self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict")): functools.partial(self.builder.inttoptr, node,
self.fp_pointer),
(create_composite_type_from_string("double * restrict const"),
create_composite_type_from_string("int")): functools.partial(self.builder.ptrtoint, node,
self.integer),
(create_composite_type_from_string("int"),
create_composite_type_from_string("double * restrict const")): functools.partial(self.builder.inttoptr,
node, self.fp_pointer),
}
# TODO float, TEST: const, restrict
# TODO bitcast, addrspacecast
# TODO unsigned/signed fills
# print([x for x in decision.keys()])
# print("Types:")
# print([(type(x), type(y)) for (x, y) in decision.keys()])
# print("Cast:")
# print((from_dtype, to_dtype))
return decision[(from_dtype, to_dtype)]()
def _print_pointer_arithmetic_func(self, pointer):
ptr = self._print(pointer.args[0])
index = self._print(pointer.args[1])
return self.builder.gep(ptr, [index])
def _print_Indexed(self, indexed):
ptr = self._print(indexed.base.label)
index = self._print(indexed.args[1])
gep = self.builder.gep(ptr, [index])
return self.builder.load(gep, name=indexed.base.label.name)
def _print_Piecewise(self, piece):
if not piece.args[-1].cond:
# We need the last conditional to be a True, otherwise the resulting
# function may not return a result.
raise ValueError("All Piecewise expressions must contain an "
"(expr, True) statement to be used as a default "
"condition. Without one, the generated "
"expression may not evaluate to anything under "
"some condition.")
        if piece.has(Assignment):
            raise NotImplementedError('The llvm-backend does not support assignments '
                                      'in the Piecewise function. It is questionable '
                                      'whether to implement it. So far there is no '
                                      'use-case to test it.')
else:
phi_data = []
after_block = self.builder.append_basic_block()
for (expr, condition) in piece.args:
if condition == sp.sympify(True): # Don't use 'is' use '=='!
phi_data.append((self._print(expr), self.builder.block))
self.builder.branch(after_block)
self.builder.position_at_end(after_block)
else:
cond = self._print(condition)
true_block = self.builder.append_basic_block()
false_block = self.builder.append_basic_block()
self.builder.cbranch(cond, true_block, false_block)
self.builder.position_at_end(true_block)
phi_data.append((self._print(expr), true_block))
self.builder.branch(after_block)
self.builder.position_at_end(false_block)
phi = self.builder.phi(to_llvm_type(get_type_of_expression(piece)))
for (val, block) in phi_data:
phi.add_incoming(val, block)
return phi
# Should have a list of math library functions to validate this.
# TODO function calls to libs
def _print_Function(self, expr):
name = expr.func.__name__
e0 = self._print(expr.args[0])
fn = self.ext_fn.get(name)
if not fn:
fn_type = ir.FunctionType(self.fp_type, [self.fp_type])
fn = ir.Function(self.module, fn_type, name)
self.ext_fn[name] = fn
return self.builder.call(fn, [e0], name)
def empty_printer(self, expr):
try:
import inspect
            mro = inspect.getmro(type(expr))  # getmro expects a class, not an instance
except AttributeError:
mro = "None"
raise TypeError("Unsupported type for LLVM JIT conversion: Expression:\"%s\", Type:\"%s\", MRO:%s"
% (expr, type(expr), mro))
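# Sketch: lower a small pystencils kernel to LLVM IR with the printer above
# (illustrative; uses the llvm backend's create_kernel, which inserts the
# required type casts before printing).
def _example_generate_llvm():
    import numpy as np
    import pystencils as ps
    from pystencils.llvm import create_kernel
    src, dst = ps.fields(src=np.zeros((8, 8)), dst=np.zeros((8, 8)))
    ast = create_kernel([ps.Assignment(dst[0, 0], src[0, 0])])
    return generate_llvm(ast)  # an llvmlite ir.Function in a fresh module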
import llvmlite.ir as ir
import llvmlite.binding as llvm
import numpy as np
import ctypes as ct
from pystencils.data_types import create_composite_type_from_string
from ..data_types import to_ctypes, ctypes_from_llvm, StructType, get_base_type
from .llvm import generate_llvm
from pystencils.transformations import symbol_name_to_variable_name
from pystencils.field import FieldType
def build_ctypes_argument_list(parameter_specification, argument_dict):
argument_dict = {symbol_name_to_variable_name(k): v for k, v in argument_dict.items()}
ct_arguments = []
array_shapes = set()
index_arr_shapes = set()
for arg in parameter_specification:
if arg.is_field_argument:
try:
field_arr = argument_dict[arg.field_name]
except KeyError:
raise KeyError("Missing field parameter for kernel call " + arg.field_name)
symbolic_field = arg.field
if arg.is_field_ptr_argument:
ct_arguments.append(field_arr.ctypes.data_as(to_ctypes(arg.dtype)))
if symbolic_field.has_fixed_shape:
symbolic_field_shape = tuple(int(i) for i in symbolic_field.shape)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_shape = symbolic_field_shape[:-1]
                    if symbolic_field_shape != field_arr.shape:
                        raise ValueError("Passed array '%s' has shape %s which does not match expected shape %s" %
                                         (arg.field_name, str(field_arr.shape), str(symbolic_field_shape)))
if symbolic_field.has_fixed_shape:
symbolic_field_strides = tuple(int(i) * field_arr.itemsize for i in symbolic_field.strides)
if isinstance(symbolic_field.dtype, StructType):
symbolic_field_strides = symbolic_field_strides[:-1]
if symbolic_field_strides != field_arr.strides:
raise ValueError("Passed array '%s' has strides %s which does not match expected strides %s" %
(arg.field_name, str(field_arr.strides), str(symbolic_field_strides)))
if FieldType.is_indexed(symbolic_field):
index_arr_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif FieldType.is_generic(symbolic_field):
array_shapes.add(field_arr.shape[:symbolic_field.spatial_dimensions])
elif arg.is_field_shape_argument:
data_type = to_ctypes(get_base_type(arg.dtype))
ct_arguments.append(field_arr.ctypes.shape_as(data_type))
elif arg.is_field_stride_argument:
data_type = to_ctypes(get_base_type(arg.dtype))
strides = field_arr.ctypes.strides_as(data_type)
for i in range(len(field_arr.shape)):
assert strides[i] % field_arr.itemsize == 0
strides[i] //= field_arr.itemsize
ct_arguments.append(strides)
else:
assert False
else:
try:
param = argument_dict[arg.name]
except KeyError:
raise KeyError("Missing parameter for kernel call " + arg.name)
expected_type = to_ctypes(arg.dtype)
ct_arguments.append(expected_type(param))
if len(array_shapes) > 1:
raise ValueError("All passed arrays have to have the same size " + str(array_shapes))
    if len(index_arr_shapes) > 1:
        raise ValueError("All passed index arrays have to have the same size " + str(index_arr_shapes))
return ct_arguments
def make_python_function_incomplete_params(kernel_function_node, argument_dict, func):
parameters = kernel_function_node.parameters
cache = {}
cache_values = []
def wrapper(**kwargs):
key = hash(tuple((k, v.ctypes.data, v.strides, v.shape) if isinstance(v, np.ndarray) else (k, id(v))
for k, v in kwargs.items()))
try:
args = cache[key]
func(*args)
except KeyError:
full_arguments = argument_dict.copy()
full_arguments.update(kwargs)
args = build_ctypes_argument_list(parameters, full_arguments)
cache[key] = args
cache_values.append(kwargs) # keep objects alive such that ids remain unique
func(*args)
wrapper.ast = kernel_function_node
wrapper.parameters = kernel_function_node.parameters
return wrapper
def generate_and_jit(ast):
gen = generate_llvm(ast)
if isinstance(gen, ir.Module):
return compile_llvm(gen)
else:
return compile_llvm(gen.module)
def make_python_function(ast, argument_dict={}, func=None):
if func is None:
jit = generate_and_jit(ast)
func = jit.get_function_ptr(ast.function_name)
try:
args = build_ctypes_argument_list(ast.parameters, argument_dict)
except KeyError:
# not all parameters specified yet
return make_python_function_incomplete_params(ast, argument_dict, func)
return lambda: func(*args)
def compile_llvm(module):
jit = Jit()
jit.parse(module)
jit.optimize()
jit.compile()
return jit
class Jit(object):
def __init__(self):
llvm.initialize()
llvm.initialize_all_targets()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = None
self._llvmmod = llvm.parse_assembly("")
self.target = llvm.Target.from_default_triple()
self.cpu = llvm.get_host_cpu_name()
self.cpu_features = llvm.get_host_cpu_features()
self.target_machine = self.target.create_target_machine(cpu=self.cpu, features=self.cpu_features.flatten(),
opt=2)
llvm.check_jit_execution()
self.ee = llvm.create_mcjit_compiler(self.llvmmod, self.target_machine)
self.ee.finalize_object()
self.fptr = None
@property
def llvmmod(self):
return self._llvmmod
@llvmmod.setter
def llvmmod(self, mod):
self.ee.remove_module(self.llvmmod)
self.ee.add_module(mod)
self.ee.finalize_object()
self.compile()
self._llvmmod = mod
def parse(self, module):
self.module = module
llvmmod = llvm.parse_assembly(str(module))
llvmmod.verify()
llvmmod.triple = self.target.triple
llvmmod.name = 'module'
self.llvmmod = llvmmod
def write_ll(self, file):
with open(file, 'w') as f:
f.write(str(self.llvmmod))
def write_assembly(self, file):
with open(file, 'w') as f:
f.write(self.target_machine.emit_assembly(self.llvmmod))
def write_object_file(self, file):
with open(file, 'wb') as f:
f.write(self.target_machine.emit_object(self.llvmmod))
def optimize(self):
pmb = llvm.create_pass_manager_builder()
pmb.opt_level = 2
pmb.disable_unit_at_a_time = False
pmb.loop_vectorize = True
pmb.slp_vectorize = True
# TODO possible to pass for functions
pm = llvm.create_module_pass_manager()
pm.add_instruction_combining_pass()
pm.add_function_attrs_pass()
pm.add_constant_merge_pass()
pm.add_licm_pass()
pmb.populate(pm)
pm.run(self.llvmmod)
def compile(self):
fptr = {}
for func in self.module.functions:
if not func.is_declaration:
return_type = None
if func.ftype.return_type != ir.VoidType():
return_type = to_ctypes(create_composite_type_from_string(str(func.ftype.return_type)))
args = [ctypes_from_llvm(arg) for arg in func.ftype.args]
function_address = self.ee.get_function_address(func.name)
fptr[func.name] = ct.CFUNCTYPE(return_type, *args)(function_address)
self.fptr = fptr
def __call__(self, func, *args, **kwargs):
target_function = next(f for f in self.module.functions if f.name == func)
arg_types = [ctypes_from_llvm(arg.type) for arg in target_function.args]
transformed_args = []
for i, arg in enumerate(args):
if isinstance(arg, np.ndarray):
transformed_args.append(arg.ctypes.data_as(arg_types[i]))
else:
transformed_args.append(arg)
self.fptr[func](*transformed_args)
def print_functions(self):
for f in self.module.functions:
print(f.ftype.return_type, f.name, f.args)
def get_function_ptr(self, name):
fptr = self.fptr[name]
fptr.jit = self
return fptr
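# Hedged end-to-end sketch for this JIT: build an LLVM kernel and call it on numpy
# arrays (assumes llvmlite can JIT on the host; all names are illustrative).
def _example_llvm_jit():
    import numpy as np
    import pystencils as ps
    from pystencils.llvm import create_kernel, make_python_function
    src, dst = ps.fields(src=np.ones((8, 8)), dst=np.zeros((8, 8)))
    ast = create_kernel([ps.Assignment(dst[0, 0], 2 * src[0, 0])])
    kernel = make_python_function(ast)
    src_arr, dst_arr = np.ones((8, 8)), np.zeros((8, 8))
    kernel(src=src_arr, dst=dst_arr)  # dst_arr now holds 2.0 (no ghost layers needed here)
    return dst_arr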
[project]
name = "pystencils"
description = "Speeding up stencil computations on CPUs and GPUs"
dynamic = ["version"]
readme = "README.md"
authors = [
{ name = "Martin Bauer" },
{ name = "Jan Hönig " },
{ name = "Markus Holzer" },
{ name = "Frederik Hennig" },
{ email = "cs10-codegen@fau.de" },
]
license = { file = "COPYING.txt" }
requires-python = ">=3.10"
dependencies = ["sympy>=1.9,<=1.12.1", "numpy>=1.8.0", "appdirs", "joblib", "pyyaml", "fasteners"]
classifiers = [
"Development Status :: 4 - Beta",
"Framework :: Jupyter",
"Topic :: Software Development :: Code Generators",
"Topic :: Scientific/Engineering :: Physics",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
]
[project.urls]
"Bug Tracker" = "https://i10git.cs.fau.de/pycodegen/pystencils/-/issues"
"Documentation" = "https://pycodegen.pages.i10git.cs.fau.de/pystencils/"
"Source Code" = "https://i10git.cs.fau.de/pycodegen/pystencils"
[project.optional-dependencies]
gpu = ['cupy']
alltrafos = ['islpy', 'py-cpuinfo']
bench_db = ['blitzdb', 'pymongo', 'pandas']
interactive = [
'matplotlib',
'ipy_table',
'imageio',
'jupyter',
'pyevtk',
'rich',
'graphviz',
]
use_cython = [
'Cython'
]
doc = [
'sphinx',
'sphinx_rtd_theme',
'nbsphinx',
'sphinxcontrib-bibtex',
'sphinx_autodoc_typehints',
'pandoc',
]
tests = [
'pytest',
'pytest-cov',
'pytest-html',
'ansi2html',
'pytest-xdist',
'flake8',
'nbformat',
'nbconvert',
'ipython',
'matplotlib',
'py-cpuinfo',
'randomgen>=1.18',
]
[build-system]
requires = [
"setuptools>=61",
"versioneer[toml]>=0.29",
# 'Cython'
]
build-backend = "setuptools.build_meta"
[tool.setuptools.package-data]
pystencils = [
"include/*.h",
"boundaries/createindexlistcython.pyx"
]
[tool.setuptools.packages.find]
where = ["src"]
include = ["pystencils", "pystencils.*"]
namespaces = false
[tool.versioneer]
# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.
VCS = "git"
style = "pep440"
versionfile_source = "src/pystencils/_version.py"
versionfile_build = "pystencils/_version.py"
tag_prefix = "release/"
parentdir_prefix = "pystencils-"
[pytest]
testpaths = src tests doc/notebooks
pythonpath = src
python_files = test_*.py *_test.py scenario_*.py
norecursedirs = *.egg-info .git .cache .ipynb_checkpoints htmlcov
addopts = --doctest-modules --durations=20 --cov-config pytest.ini
markers =
    longrun: tests that run only at night since they have long execution times
notebook: mark for notebooks
# these warnings all come from third party libraries.
filterwarnings =
ignore:an integer is required:DeprecationWarning
ignore:\s*load will be removed, use:PendingDeprecationWarning
ignore:the imp module is deprecated in favour of importlib:DeprecationWarning
ignore:.*is a deprecated alias for the builtin `bool`:DeprecationWarning
ignore:'contextfilter' is renamed to 'pass_context':DeprecationWarning
ignore:Using or importing the ABCs from 'collections' instead of from 'collections.abc':DeprecationWarning
ignore:Animation was deleted without rendering anything:UserWarning
[run]
branch = True
source = src/pystencils
tests
omit = doc/*
tests/*
setup.py
quicktest.py
conftest.py
versioneer.py
src/pystencils/jupytersetup.py
src/pystencils/cpu/msvc_detection.py
src/pystencils/sympy_gmpy_bug_workaround.py
src/pystencils/cache.py
src/pystencils/pacxx/benchmark.py
src/pystencils/_version.py
venv/
[report]
exclude_lines =
# Have to re-enable the standard pragma
pragma: no cover
def __repr__
def _repr_html_
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
NotImplementedError()
#raise ValueError
# Don't complain if non-runnable code isn't run:
if 0:
if False:
if __name__ == .__main__.:
skip_covered = True
fail_under = 85
[html]
directory = coverage_report
#!/usr/bin/env python3
from contextlib import redirect_stdout
import io
from tests.test_quicktests import (
test_basic_kernel,
test_basic_blocking_staggered,
test_basic_vectorization,
)
quick_tests = [
test_basic_kernel,
test_basic_blocking_staggered,
test_basic_vectorization,
]
if __name__ == "__main__":
print("Running pystencils quicktests")
for qt in quick_tests:
print(f" -> {qt.__name__}")
with redirect_stdout(io.StringIO()):
qt()
#!/bin/bash
echo "Existing versions"
git tag -l | grep release
echo "Enter the next version"
read new_version
git tag -s release/${new_version}
git push origin master release/${new_version}
rm -rf dist
python setup.py sdist
twine upload dist/*
import pystencils.sympy_gmpy_bug_workaround
import sympy as sp
import numpy as np
import pystencils as ps
import pystencils.plot2d as plt
import pystencils.jupytersetup as ps_notebook
__all__ = ['sp', 'np', 'ps', 'plt', 'ps_notebook']
from setuptools import setup, __version__ as setuptools_version
if int(setuptools_version.split('.')[0]) < 61:
raise Exception(
"[ERROR] pystencils requires at least setuptools version 61 to install.\n"
"If this error occurs during an installation via pip, it is likely that there is a conflict between "
"versions of setuptools installed by pip and the system package manager. "
"In this case, it is recommended to install pystencils into a virtual environment instead."
)
import versioneer
def get_cmdclass():
return versioneer.get_cmdclass()
setup(
version=versioneer.get_version(),
cmdclass=get_cmdclass(),
)
import sympy as sp
from typing import Callable, List
from pystencils.assignment import Assignment
from pystencils.simp.assignment_collection import AssignmentCollection
from pystencils.sympyextensions import subs_additive
AC = AssignmentCollection
def sympy_cse(ac: AC) -> AC:
"""Searches for common subexpressions inside the equation collection.
Searches is done in both the existing subexpressions as well as the assignments themselves.
It uses the sympy subexpression detection to do this. Return a new equation collection
with the additional subexpressions found
"""
symbol_gen = ac.subexpression_symbol_generator
replacements, new_eq = sp.cse(ac.subexpressions + ac.main_assignments,
symbols=symbol_gen)
replacement_eqs = [Assignment(*r) for r in replacements]
modified_subexpressions = new_eq[:len(ac.subexpressions)]
modified_update_equations = new_eq[len(ac.subexpressions):]
new_subexpressions = replacement_eqs + modified_subexpressions
topologically_sorted_pairs = sp.cse_main.reps_toposort([[e.lhs, e.rhs] for e in new_subexpressions])
new_subexpressions = [Assignment(a[0], a[1]) for a in topologically_sorted_pairs]
return ac.copy(modified_update_equations, new_subexpressions)
def sympy_cse_on_assignment_list(assignments: List[Assignment]) -> List[Assignment]:
"""Extracts common subexpressions from a list of assignments."""
ec = AC([], assignments)
return sympy_cse(ec).all_assignments
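# Usage sketch (hypothetical symbols; assumes the usual re-export via pystencils):
#
#     import sympy as sp
#     from pystencils import Assignment
#     a, b, x, y = sp.symbols("a b x y")
#     eqs = [Assignment(x, (a + b)**2 + a),
#            Assignment(y, (a + b)**2 - b)]
#     result = sympy_cse_on_assignment_list(eqs)
#     # the shared term (a + b)**2 is extracted into a single subexpression assignment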
def subexpression_substitution_in_existing_subexpressions(ac: AC) -> AC:
"""Goes through the subexpressions list and replaces the term in the following subexpressions."""
result = []
for outer_ctr, s in enumerate(ac.subexpressions):
new_rhs = s.rhs
for inner_ctr in range(outer_ctr):
sub_expr = ac.subexpressions[inner_ctr]
new_rhs = subs_additive(new_rhs, sub_expr.lhs, sub_expr.rhs, required_match_replacement=1.0)
new_rhs = new_rhs.subs(sub_expr.rhs, sub_expr.lhs)
result.append(Assignment(s.lhs, new_rhs))
return ac.copy(ac.main_assignments, result)
def subexpression_substitution_in_main_assignments(ac: AC) -> AC:
"""Replaces already existing subexpressions in the equations of the assignment_collection."""
result = []
for s in ac.main_assignments:
new_rhs = s.rhs
for sub_expr in ac.subexpressions:
new_rhs = subs_additive(new_rhs, sub_expr.lhs, sub_expr.rhs, required_match_replacement=1.0)
result.append(Assignment(s.lhs, new_rhs))
return ac.copy(result)
def add_subexpressions_for_divisions(ac: AC) -> AC:
r"""Introduces subexpressions for all divisions which have no constant in the denominator.
For example :math:`\frac{1}{x}` is replaced while :math:`\frac{1}{3}` is not replaced.
"""
divisors = set()
def search_divisors(term):
if term.func == sp.Pow:
if term.exp.is_integer and term.exp.is_number and term.exp < 0:
divisors.add(term)
else:
for a in term.args:
search_divisors(a)
for eq in ac.all_assignments:
search_divisors(eq.rhs)
new_symbol_gen = ac.subexpression_symbol_generator
substitutions = {divisor: new_symbol for new_symbol, divisor in zip(new_symbol_gen, divisors)}
return ac.new_with_substitutions(substitutions, True)
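# Illustrative effect (subexpression symbol name is made up): for Assignment(y, x / a + 1 / 3),
# the division x / a contains a Pow with negative integer exponent (a**-1), so a
# subexpression like xi_0 = 1 / a is introduced, while the constant 1 / 3 stays untouched.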
def apply_to_all_assignments(operation: Callable[[sp.Expr], sp.Expr]) -> Callable[[AC], AC]:
"""Applies sympy expand operation to all equations in collection."""
def f(assignment_collection: AC) -> AC:
result = [Assignment(eq.lhs, operation(eq.rhs)) for eq in assignment_collection.main_assignments]
return assignment_collection.copy(result)
f.__name__ = operation.__name__
return f
def apply_on_all_subexpressions(operation: Callable[[sp.Expr], sp.Expr]) -> Callable[[AC], AC]:
"""Applies the given operation on all subexpressions of the AC."""
def f(ac: AC) -> AC:
result = [Assignment(eq.lhs, operation(eq.rhs)) for eq in ac.subexpressions]
return ac.copy(ac.main_assignments, result)
f.__name__ = operation.__name__
return f
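# Usage sketch: both factories turn a plain sympy operation into a collection transformer, e.g.
#
#     expand_all = apply_to_all_assignments(sp.expand)
#     ac_expanded = expand_all(ac)   # 'ac' is any AssignmentCollection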
"""Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions""" """Module to generate stencil kernels in C or CUDA using sympy expressions and call them as Python functions"""
from . import sympy_gmpy_bug_workaround # NOQA from .enums import Backend, Target
from . import fd
from . import stencil as stencil
from .assignment import Assignment, AddAugmentedAssignment, assignment_from_stencil
from .typing.typed_sympy import TypedSymbol
from .display_utils import get_code_obj, get_code_str, show_code, to_dot
from .field import Field, FieldType, fields from .field import Field, FieldType, fields
from .data_types import TypedSymbol from .config import CreateKernelConfig
from .slicing import make_slice from .cache import clear_cache
from .kernelcreation import create_kernel, create_indexed_kernel, create_staggered_kernel from .kernel_decorator import kernel, kernel_config
from .display_utils import show_code, to_dot from .kernelcreation import create_kernel, create_staggered_kernel
from .simp import AssignmentCollection from .simp import AssignmentCollection
from .assignment import Assignment from .slicing import make_slice
from .spatial_coordinates import x_, x_staggered, x_staggered_vector, x_vector, y_, y_staggered, z_, z_staggered
from .sympyextensions import SymbolCreator from .sympyextensions import SymbolCreator
from .datahandling import create_data_handling from .datahandling import create_data_handling
from .kernel_decorator import kernel
from .stencils import visualize_stencil_expression
from . import fd
__all__ = ['Field', 'FieldType', 'fields', __all__ = ['Field', 'FieldType', 'fields',
'TypedSymbol', 'TypedSymbol',
'make_slice', 'make_slice',
'create_kernel', 'create_indexed_kernel', 'create_staggered_kernel', 'CreateKernelConfig',
'show_code', 'to_dot', 'create_kernel', 'create_staggered_kernel',
'Target', 'Backend',
'show_code', 'to_dot', 'get_code_obj', 'get_code_str',
'AssignmentCollection', 'AssignmentCollection',
'Assignment', 'Assignment', 'AddAugmentedAssignment',
'assignment_from_stencil',
'SymbolCreator', 'SymbolCreator',
'create_data_handling', 'create_data_handling',
'kernel', 'clear_cache',
'kernel', 'kernel_config',
'x_', 'y_', 'z_',
'x_staggered', 'y_staggered', 'z_staggered',
'x_vector', 'x_staggered_vector',
'fd', 'fd',
'visualize_stencil_expression'] 'stencil']
from . import _version
__version__ = _version.get_versions()['version']
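# Quick-start sketch using the public API above (field spec and sizes are illustrative):
#
#     import numpy as np
#     import pystencils as ps
#
#     src, dst = ps.fields("src, dst: float64[2D]")
#     update = ps.Assignment(dst.center, (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / 4)
#     kernel = ps.create_kernel(update).compile()
#
#     a, b = np.zeros((32, 32)), np.zeros((32, 32))
#     kernel(src=a, dst=b)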
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by GitHub's download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain.
# Generated by versioneer-0.29
# https://github.com/python-versioneer/python-versioneer
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple
import functools
def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = "$Format:%d$"
git_full = "$Format:%H$"
git_date = "$Format:%ci$"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
VCS: str
style: str
tag_prefix: str
parentdir_prefix: str
versionfile_source: str
verbose: bool
def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
cfg = VersioneerConfig()
cfg.VCS = "git"
cfg.style = "pep440"
cfg.tag_prefix = "release/"
cfg.parentdir_prefix = "pystencils-"
cfg.versionfile_source = "src/pystencils/_version.py"
cfg.verbose = False
return cfg
class NotThisMethod(Exception):
"""Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}
def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
"""Create decorator to mark a method as the handler of a VCS."""
def decorate(f: Callable) -> Callable:
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f
return decorate
def run_command(
commands: List[str],
args: List[str],
cwd: Optional[str] = None,
verbose: bool = False,
hide_stderr: bool = False,
env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
"""Call the given command(s)."""
assert isinstance(commands, list)
process = None
popen_kwargs: Dict[str, Any] = {}
if sys.platform == "win32":
# This hides the console window if pythonw.exe is used
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
popen_kwargs["startupinfo"] = startupinfo
for command in commands:
try:
dispcmd = str([command] + args)
# remember shell=False, so use git.cmd on windows, not just git
process = subprocess.Popen([command] + args, cwd=cwd, env=env,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None), **popen_kwargs)
break
except OSError as e:
if e.errno == errno.ENOENT:
continue
if verbose:
print("unable to run %s" % dispcmd)
print(e)
return None, None
else:
if verbose:
print("unable to find command, tried %s" % (commands,))
return None, None
stdout = process.communicate()[0].strip().decode()
if process.returncode != 0:
if verbose:
print("unable to run %s (error)" % dispcmd)
print("stdout was %s" % stdout)
return None, process.returncode
return stdout, process.returncode
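# Illustrative call: run 'git describe' in the current directory; returns a
# (stdout, returncode) pair, or (None, None) if none of the listed commands could be started.
#
#     out, rc = run_command(["git"], ["describe", "--tags", "--always"], cwd=".")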
def versions_from_parentdir(
parentdir_prefix: str,
root: str,
verbose: bool,
) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
the project name and a version string. We will also support searching up
two directory levels for an appropriately named parent directory
"""
rootdirs = []
for _ in range(3):
dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None, "date": None}
rootdirs.append(root)
root = os.path.dirname(root) # up a level
if verbose:
print("Tried directories %s but none started with prefix %s" %
(str(rootdirs), parentdir_prefix))
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
keywords: Dict[str, str] = {}
try:
with open(versionfile_abs, "r") as fobj:
for line in fobj:
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
if line.strip().startswith("git_date ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["date"] = mo.group(1)
except OSError:
pass
return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
keywords: Dict[str, str],
tag_prefix: str,
verbose: bool,
) -> Dict[str, Any]:
"""Get version information from git keywords."""
if "refnames" not in keywords:
raise NotThisMethod("Short version file found")
date = keywords.get("date")
if date is not None:
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
# git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
# datestamp. However we prefer "%ci" (which expands to an "ISO-8601
# -like" string, which we must then edit to make compliant), because
# it's been around since git-1.5.3, and it's too difficult to
# discover which version we're using, or to work around using an
# older one.
date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
refnames = keywords["refnames"].strip()
if refnames.startswith("$Format"):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
refs = {r.strip() for r in refnames.strip("()").split(",")}
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
# expansion behaves like git log --decorate=short and strips out the
# refs/heads/ and refs/tags/ prefixes that would let us distinguish
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
tags = {r for r in refs if re.search(r'\d', r)}
if verbose:
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
print("likely tags: %s" % ",".join(sorted(tags)))
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
# Filter out refs that exactly match prefix or that don't start
# with a number once the prefix is stripped (mostly a concern
# when prefix is '')
if not re.match(r'\d', r):
continue
if verbose:
print("picking %s" % r)
return {"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": None,
"date": date}
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
return {"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": "no suitable tags", "date": None}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(
tag_prefix: str,
root: str,
verbose: bool,
runner: Callable = run_command
) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
# GIT_DIR can interfere with correct operation of Versioneer.
# It may be intended to be passed to the Versioneer-versioned project,
# but that should not change where we get our version from.
env = os.environ.copy()
env.pop("GIT_DIR", None)
runner = functools.partial(runner, env=env)
_, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
raise NotThisMethod("'git rev-parse --git-dir' returned error")
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
describe_out, rc = runner(GITS, [
"describe", "--tags", "--dirty", "--always", "--long",
"--match", f"{tag_prefix}[[:digit:]]*"
], cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
cwd=root)
# --abbrev-ref was added in git-1.6.3
if rc != 0 or branch_name is None:
raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
branch_name = branch_name.strip()
if branch_name == "HEAD":
# If we aren't exactly on a branch, pick a branch which represents
# the current commit. If all else fails, we are on a branchless
# commit.
branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
# --contains was added in git-1.5.4
if rc != 0 or branches is None:
raise NotThisMethod("'git branch --contains' returned error")
branches = branches.split("\n")
# Remove the first line if we're running detached
if "(" in branches[0]:
branches.pop(0)
# Strip off the leading "* " from the list of branches.
branches = [branch[2:] for branch in branches]
if "master" in branches:
branch_name = "master"
elif not branches:
branch_name = None
else:
# Pick the first branch that is returned. Good or bad.
branch_name = branches[0]
pieces["branch"] = branch_name
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
# look for -dirty suffix
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
git_describe = git_describe[:git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparsable. Maybe git-describe is misbehaving?
pieces["error"] = ("unable to parse git-describe output: '%s'"
% describe_out)
return pieces
# tag
full_tag = mo.group(1)
if not full_tag.startswith(tag_prefix):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
% (full_tag, tag_prefix))
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix):]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
# commit: short hex revision ID
pieces["short"] = mo.group(3)
else:
# HEX: no tags
pieces["closest-tag"] = None
out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
return pieces
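# Illustrative parse (made-up hash): with tag_prefix "release/", a describe output of
# "release/1.2-5-gabc1234-dirty" yields pieces with closest-tag "1.2", distance 5,
# short "abc1234" and dirty True.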
def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
Exceptions:
1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += plus_or_dot(pieces)
rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0+untagged.%d.g%s" % (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
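# Worked example (made-up pieces): closest-tag "1.2", distance 3, short "abc1234" and a
# dirty tree render as "1.2+3.gabc1234.dirty"; at distance 0 with a clean tree the
# result is just "1.2".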
def render_pep440_branch(pieces: Dict[str, Any]) -> str:
"""TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
The ".dev0" means not master branch. Note that .dev0 sorts backwards
(a feature branch will appear "older" than the master branch).
Exceptions:
1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0"
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+untagged.%d.g%s" % (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
"""Split pep440 version string at the post-release segment.
Returns the release segments before the post-release and the
post-release version number (or -1 if no post-release segment is present).
"""
vc = str.split(ver, ".post")
return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
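# e.g. pep440_split_post("1.2.post3") == ("1.2", 3) and pep440_split_post("1.2") == ("1.2", None)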
def render_pep440_pre(pieces: Dict[str, Any]) -> str:
"""TAG[.postN.devDISTANCE] -- No -dirty.
Exceptions:
1: no tags. 0.post0.devDISTANCE
"""
if pieces["closest-tag"]:
if pieces["distance"]:
# update the post release segment
tag_version, post_version = pep440_split_post(pieces["closest-tag"])
rendered = tag_version
if post_version is not None:
rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
else:
rendered += ".post0.dev%d" % (pieces["distance"])
else:
# no commits, use the tag as the version
rendered = pieces["closest-tag"]
else:
# exception #1
rendered = "0.post0.dev%d" % pieces["distance"]
return rendered
def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
(a dirty tree will appear "older" than the corresponding clean one),
but you shouldn't be releasing software with -dirty anyways.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%s" % pieces["short"]
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += "+g%s" % pieces["short"]
return rendered
def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
The ".dev0" means not master branch.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%s" % pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+g%s" % pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
else:
# exception #1
rendered = "0.post%d" % pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
return rendered
def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"]:
rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always -long'.
The distance/hash is unconditional.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
"full-revisionid": pieces.get("long"),
"dirty": None,
"error": pieces["error"],
"date": None}
if not style or style == "default":
style = "pep440" # the default
if style == "pep440":
rendered = render_pep440(pieces)
elif style == "pep440-branch":
rendered = render_pep440_branch(pieces)
elif style == "pep440-pre":
rendered = render_pep440_pre(pieces)
elif style == "pep440-post":
rendered = render_pep440_post(pieces)
elif style == "pep440-post-branch":
rendered = render_pep440_post_branch(pieces)
elif style == "pep440-old":
rendered = render_pep440_old(pieces)
elif style == "git-describe":
rendered = render_git_describe(pieces)
elif style == "git-describe-long":
rendered = render_git_describe_long(pieces)
else:
raise ValueError("unknown style '%s'" % style)
return {"version": rendered, "full-revisionid": pieces["long"],
"dirty": pieces["dirty"], "error": None,
"date": pieces.get("date")}
def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
# py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
# case we can only use expanded keywords.
cfg = get_config()
verbose = cfg.verbose
try:
return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
verbose)
except NotThisMethod:
pass
try:
root = os.path.realpath(__file__)
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
for _ in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to find root of source tree",
"date": None}
try:
pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
return render(pieces, cfg.style)
except NotThisMethod:
pass
try:
if cfg.parentdir_prefix:
return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
except NotThisMethod:
pass
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to compute version", "date": None}
import numpy as np


def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    """
    Creates an aligned empty numpy array

    Args:
        shape: size of the array
        byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
                        By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
                        When 'cacheline' is specified, the size of a cache line is used.
        dtype: numpy data type
        byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
                     typically used to align first inner cell instead of ghost layer
        order: storage linearization order
        align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well
    """
    if byte_alignment is True or byte_alignment == 'cacheline':
        from pystencils.backends.simd_instruction_sets import (get_supported_instruction_sets, get_cacheline_size,
                                                               get_vector_instruction_set)

        instruction_sets = get_supported_instruction_sets()
        if instruction_sets is None:
            byte_alignment = 64
        elif byte_alignment == 'cacheline':
            cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
            if all([s is None for s in cacheline_sizes]) or \
                    max([s for s in cacheline_sizes if s is not None]) > 0x100000:
                widths = [get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                          for is_name in instruction_sets
                          if type(get_vector_instruction_set(dtype, is_name)['width']) is int]
                byte_alignment = 64 if all([s is None for s in widths]) else max(widths)
            else:
                byte_alignment = max([s for s in cacheline_sizes if s is not None])
        elif not any([type(get_vector_instruction_set(dtype, is_name)['width']) is int
                      for is_name in instruction_sets]):
            byte_alignment = 64
        else:
            byte_alignment = max([get_vector_instruction_set(dtype, is_name)['width'] * np.dtype(dtype).itemsize
                                  for is_name in instruction_sets
                                  if type(get_vector_instruction_set(dtype, is_name)['width']) is int])
    if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
        size = np.prod(shape)
        d = np.dtype(dtype)
@@ -51,7 +77,7 @@ def aligned_empty(shape, byte_alignment=32, dtype=np.float64, byte_offset=0, ord
    return tmp


def aligned_zeros(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.zeros((), arr.dtype)
@@ -59,7 +85,7 @@ def aligned_zeros(shape, byte_alignment=16, dtype=float, byte_offset=0, order='C
    return arr


def aligned_ones(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    arr = aligned_empty(shape, dtype=dtype, byte_offset=byte_offset,
                        order=order, byte_alignment=byte_alignment, align_inner_coordinate=align_inner_coordinate)
    x = np.ones((), arr.dtype)
...
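# Usage sketch (shape and alignment are illustrative):
#
#     arr = aligned_empty((32, 32), byte_alignment=64)
#     assert arr.ctypes.data % 64 == 0   # start address is 64-byte aligned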