Showing 1582 additions and 112 deletions
from collections import defaultdict
from functools import partial
from typing import Tuple, Union, Sequence
import numpy as np
import sympy as sp
from sympy.logic.boolalg import Boolean, BooleanFunction
import pystencils
from pystencils.cache import memorycache_if_hashable
from pystencils.typing.types import BasicType, VectorType, PointerType, create_type
from pystencils.typing.cast_functions import CastFunc
from pystencils.typing.typed_sympy import TypedSymbol
from pystencils.utils import all_equal
def typed_symbols(names, dtype, **kwargs):
"""
Creates TypedSymbols with the same functionality as sympy.symbols
Args:
names: See sympy.symbols
dtype: The data type all symbols will have
**kwargs: Keyword arguments passed to sympy.symbols
Returns:
TypedSymbols
"""
symbols = sp.symbols(names, **kwargs)
if isinstance(symbols, tuple):
return tuple(TypedSymbol(str(s), dtype) for s in symbols)
else:
return TypedSymbol(str(symbols), dtype)
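# Example usage (illustrative sketch): create two symbols that carry an explicit float64 type.
#     vx, vy = typed_symbols("vx, vy", create_type("float64"))
#     assert all(isinstance(s, TypedSymbol) for s in (vx, vy))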
def get_base_type(data_type):
"""
Returns the BasicType of a Pointer or a Vector
"""
while data_type.base_type is not None:
data_type = data_type.base_type
return data_type
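# Example (illustrative): peeling a pointer type yields its scalar element type.
#     get_base_type(PointerType(BasicType("float64")))   # -> BasicType("float64")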
def result_type(*args: np.dtype):
"""Returns the type of the result if the np.dtype arguments would be collated.
We can't use numpy functionality, because numpy casts don't behave exactly like C casts"""
s = sorted(args, key=lambda x: x.itemsize)
def kind_to_value(kind: str) -> int:
if kind == 'f':
return 3
elif kind == 'i':
return 2
elif kind == 'u':
return 1
elif kind == 'b':
return 0
else:
raise NotImplementedError(f'{kind=} is not a supported kind of a type. See "numpy.dtype.kind" for options')
s = sorted(s, key=lambda x: kind_to_value(x.kind))
return s[-1]
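# Illustrative collation order (sketch): float outranks int, int outranks uint, uint outranks bool;
# among types of the same kind, the larger itemsize wins.
#     result_type(np.dtype("int32"), np.dtype("float32"))    # -> dtype("float32")
#     result_type(np.dtype("float32"), np.dtype("float64"))  # -> dtype("float64")
#     result_type(np.dtype("uint8"), np.dtype("int8"))       # -> dtype("int8")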
def collate_types(types: Sequence[Union[BasicType, VectorType]]):
"""
Takes a sequence of types and returns their "common type" e.g. (float, double, float) -> double
Uses the collation rules from numpy.
"""
# Pointer arithmetic case i.e. pointer + [int, uint] is allowed
if any(isinstance(t, PointerType) for t in types):
pointer_type = None
for t in types:
if isinstance(t, PointerType):
if pointer_type is not None:
raise ValueError(f'Cannot collate the combination of two pointer types "{pointer_type}" and "{t}"')
pointer_type = t
elif isinstance(t, BasicType):
if not (t.is_int() or t.is_uint()):
raise ValueError("Invalid pointer arithmetic")
else:
raise ValueError("Invalid pointer arithmetic")
return pointer_type
# peel off vector types; if at least one vector type occurred, the result will also be a vector type
vector_type = [t for t in types if isinstance(t, VectorType)]
if not all_equal(t.width for t in vector_type):
raise ValueError("Collation failed because of vector types with different width")
# TODO: check if this is needed
# def peel_off_type(dtype, type_to_peel_off):
# while type(dtype) is type_to_peel_off:
# dtype = dtype.base_type
# return dtype
# types = [peel_off_type(t, VectorType) for t in types]
types = [t.base_type if isinstance(t, VectorType) else t for t in types]
# now we should have a list of basic types - struct types are not yet supported
assert all(type(t) is BasicType for t in types)
result_numpy_type = result_type(*(t.numpy_dtype for t in types))
result = BasicType(result_numpy_type)
if vector_type:
result = VectorType(result, vector_type[0].width)
return result
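# Behaviour sketch for the rules implemented above:
#     collate_types([create_type("float32"), create_type("float64")])  # -> float64
#     collate_types([create_type("int64"), create_type("float32")])    # -> float32 (kind outranks size)
# A scalar type collated with a VectorType is widened to a vector of the collated base type.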
# TODO get_type_of_expression should be used after leaf_typing. So no defaults should be necessary
@memorycache_if_hashable(maxsize=2048)
def get_type_of_expression(expr,
default_float_type='double',
default_int_type='int',
symbol_type_dict=None):
from pystencils.astnodes import ResolvedFieldAccess
from pystencils.cpu.vectorization import vec_all, vec_any
if default_float_type == 'float':
default_float_type = 'float32'
if not symbol_type_dict:
symbol_type_dict = defaultdict(lambda: create_type('double'))
# TODO this line is quite hard to understand, if possible simplify
get_type = partial(get_type_of_expression,
default_float_type=default_float_type,
default_int_type=default_int_type,
symbol_type_dict=symbol_type_dict)
expr = sp.sympify(expr)
if isinstance(expr, sp.Integer):
return create_type(default_int_type)
elif isinstance(expr, sp.Rational) or isinstance(expr, sp.Float):
return create_type(default_float_type)
elif isinstance(expr, ResolvedFieldAccess):
return expr.field.dtype
elif isinstance(expr, pystencils.field.Field.Access):
return expr.field.dtype
elif isinstance(expr, TypedSymbol):
return expr.dtype
elif isinstance(expr, sp.Symbol):
# TODO delete if case
if symbol_type_dict:
return symbol_type_dict[expr.name]
else:
raise ValueError("All symbols inside this expression have to be typed! ", str(expr))
elif isinstance(expr, CastFunc):
return expr.args[1]
elif isinstance(expr, (vec_any, vec_all)):
return create_type("bool")
elif hasattr(expr, 'func') and expr.func == sp.Piecewise:
collated_result_type = collate_types(tuple(get_type(a[0]) for a in expr.args))
collated_condition_type = collate_types(tuple(get_type(a[1]) for a in expr.args))
if type(collated_condition_type) is VectorType and type(collated_result_type) is not VectorType:
collated_result_type = VectorType(collated_result_type, width=collated_condition_type.width)
return collated_result_type
elif isinstance(expr, sp.Indexed):
typed_symbol = expr.base.label
return typed_symbol.dtype.base_type
elif isinstance(expr, (Boolean, BooleanFunction)):
# if any arg is of vector type return a vector boolean, else return a normal scalar boolean
result = create_type("bool")
vec_args = [get_type(a) for a in expr.args if isinstance(get_type(a), VectorType)]
if vec_args:
result = VectorType(result, width=vec_args[0].width)
return result
elif isinstance(expr, sp.Pow):
base_type = get_type(expr.args[0])
if expr.exp.is_integer:
return base_type
else:
return collate_types([create_type(default_float_type), base_type])
elif isinstance(expr, (sp.Sum, sp.Product)):
return get_type(expr.args[0])
elif isinstance(expr, sp.Expr):
expr: sp.Expr
if expr.args:
types = tuple(get_type(a) for a in expr.args)
return collate_types(types)
else:
if expr.is_integer:
return create_type(default_int_type)
else:
return create_type(default_float_type)
raise NotImplementedError("Could not determine type for", expr, type(expr))
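# Expected behaviour (sketch; symbols must already carry a type at this point):
#     x = TypedSymbol("x", create_type("float32"))
#     get_type_of_expression(x * x + x)   # -> float32
#     get_type_of_expression(x > 0)       # -> bool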
# Fix for sympy versions from 1.9
sympy_version = sp.__version__.split('.')
sympy_version_int = int(sympy_version[0]) * 100 + int(sympy_version[1])
if sympy_version_int >= 109:
# __setstate__ would bypass the constructor, so we remove it
if sympy_version_int >= 111:
del sp.Basic.__setstate__
del sp.Symbol.__setstate__
else:
sp.Number.__getstate__ = sp.Basic.__getstate__
del sp.Basic.__getstate__
# __reduce_ex__ would strip kwargs, so we override it
def basic_reduce_ex(self, protocol):
if hasattr(self, '__getnewargs_ex__'):
args, kwargs = self.__getnewargs_ex__()
else:
args, kwargs = self.__getnewargs__(), {}
if hasattr(self, '__getstate__'):
state = self.__getstate__()
else:
state = None
return partial(type(self), **kwargs), args, state
sp.Basic.__reduce_ex__ = basic_reduce_ex
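# Illustrative intent of the patch above (sketch, assuming the patch is active):
# TypedSymbol instances keep their dtype across a pickle round trip on sympy >= 1.9.
#     import pickle
#     s = TypedSymbol("s", create_type("float64"))
#     assert pickle.loads(pickle.dumps(s)).dtype == s.dtype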
def get_next_parent_of_type(node, parent_type):
"""Returns the next parent node of given type or None, if root is reached.
Traverses the AST nodes parents until a parent of given type was found.
If no such parent is found, None is returned
"""
parent = node.parent
while parent is not None:
if isinstance(parent, parent_type):
return parent
parent = parent.parent
return None
def parents_of_type(node, parent_type, include_current=False):
"""Generator for all parent nodes of given type"""
parent = node if include_current else node.parent
while parent is not None:
if isinstance(parent, parent_type):
yield parent
parent = parent.parent
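# Usage sketch for the two traversal helpers above ("some_node" is a placeholder for any AST node;
# LoopOverCoordinate is just one possible parent_type):
#     from pystencils.astnodes import LoopOverCoordinate
#     enclosing_loop = get_next_parent_of_type(some_node, LoopOverCoordinate)
#     all_enclosing_loops = list(parents_of_type(some_node, LoopOverCoordinate))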
import os
import itertools
from collections import Counter
from contextlib import contextmanager
from itertools import groupby
from tempfile import NamedTemporaryFile
from typing import Mapping
import numpy as np
import sympy as sp
class DotDict(dict):
@@ -14,14 +16,21 @@ class DotDict(dict):
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
# Recursively make DotDict: https://stackoverflow.com/questions/13520421/recursive-dotdict
def __init__(self, dct={}):
for key, value in dct.items():
if isinstance(value, dict):
value = DotDict(value)
self[key] = value
def all_equal(iterator):
iterator = iter(iterator)
try:
first = next(iterator)
except StopIteration:
return True
return all(first == rest for rest in iterator)
def all_equal(iterable):
"""
Returns ``True`` if all the elements are equal to each other.
Copied from: more-itertools 8.12.0
"""
g = groupby(iterable)
return next(g, True) and not next(g, False)
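# Examples (sketch):
#     all_equal([1, 1, 1])  # -> True
#     all_equal("aab")      # -> False
#     all_equal([])         # -> True (an empty iterable counts as all equal)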
def recursive_dict_update(d, u):
@@ -43,33 +52,13 @@ def recursive_dict_update(d, u):
return d
@contextmanager
def file_handle_for_atomic_write(file_path):
"""Open temporary file object that atomically moves to destination upon exiting.
Allows reading and writing to and from the same filename.
The file will not be moved to destination in case of an exception.
Args:
file_path: path to file to be opened
"""
target_folder = os.path.dirname(os.path.abspath(file_path))
with NamedTemporaryFile(delete=False, dir=target_folder, mode='w') as f:
try:
yield f
finally:
f.flush()
os.fsync(f.fileno())
os.rename(f.name, file_path)
@contextmanager
def atomic_file_write(file_path):
target_folder = os.path.dirname(os.path.abspath(file_path))
with NamedTemporaryFile(delete=False, dir=target_folder) as f:
f.file.close()
yield f.name
os.rename(f.name, file_path)
os.replace(f.name, file_path)
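# Usage sketch ("some_output.txt" is a placeholder name): the caller writes to the yielded
# temporary path; it only replaces the final file once the with-block exits without error.
#     with atomic_file_write("some_output.txt") as tmp_path:
#         with open(tmp_path, "w") as f:
#             f.write("partial results never become visible under the final name\n")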
def fully_contains(l1, l2):
@@ -89,19 +78,39 @@ def fully_contains(l1, l2):
def boolean_array_bounding_box(boolean_array):
"""Returns bounding box around "true" area of boolean array"""
dim = len(boolean_array.shape)
"""Returns bounding box around "true" area of boolean array
>>> a = np.zeros((4, 4), dtype=bool)
>>> a[1:-1, 1:-1] = True
>>> boolean_array_bounding_box(a) == [(1, 3), (1, 3)]
True
"""
dim = boolean_array.ndim
shape = boolean_array.shape
assert 0 not in shape, "Shape must not contain zero"
bounds = []
for i in range(dim):
for j in range(dim):
if i != j:
arr_1d = np.any(boolean_array, axis=j)
begin = np.argmax(arr_1d)
end = begin + np.argmin(arr_1d[begin:])
bounds.append((begin, end))
for ax in itertools.combinations(reversed(range(dim)), dim - 1):
nonzero = np.any(boolean_array, axis=ax)
t = np.where(nonzero)[0][[0, -1]]
bounds.append((t[0], t[1] + 1))
return bounds
def binary_numbers(n):
"""Returns all binary numbers up to 2^n - 1
Example:
>>> binary_numbers(2)
[[0, 0], [0, 1], [1, 0], [1, 1]]
"""
result = list()
for i in range(1 << n):
binary_number = bin(i)[2:]
binary_number = '0' * (n - len(binary_number)) + binary_number
result.append((list(map(int, binary_number))))
return result
class LinearEquationSystem:
"""Symbolic linear system of equations - consisting of matrix and right hand side.
@@ -210,7 +219,8 @@ class LinearEquationSystem:
return 'multiple'
def solution(self):
"""Solves the system if it has a single solution. Returns a dictionary mapping symbol to solution value."""
"""Solves the system. Under- and overdetermined systems are supported.
Returns a dictionary mapping symbol to solution value."""
return sp.solve_linear_system(self._matrix, *self.unknowns)
def _resize_if_necessary(self, new_rows=1):
@@ -228,6 +238,15 @@ class LinearEquationSystem:
self.next_zero_row = result
def find_unique_solutions_with_zeros(system: LinearEquationSystem):
if system.solution_structure() != 'multiple':
raise ValueError("Function works only for underdetermined systems")
class ContextVar:
def __init__(self, value):
self.stack = [value]
@contextmanager
def __call__(self, new_value):
self.stack.append(new_value)
yield self
self.stack.pop()
def get(self):
return self.stack[-1]
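# Usage sketch: temporarily override the stored value inside a with-block.
#     default_dtype = ContextVar("float64")
#     with default_dtype("float32"):
#         assert default_dtype.get() == "float32"
#     assert default_dtype.get() == "float64"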
import pytest
import sympy as sp
import numpy
import pystencils
from pystencils.datahandling import create_data_handling
@pytest.mark.parametrize('dtype', ["float64", "float32"])
@pytest.mark.parametrize('sympy_function', [sp.Min, sp.Max])
def test_max(dtype, sympy_function):
dh = create_data_handling(domain_size=(10, 10), periodicity=True)
x = dh.add_array('x', values_per_cell=1, dtype=dtype)
dh.fill("x", 0.0, ghost_layers=True)
y = dh.add_array('y', values_per_cell=1, dtype=dtype)
dh.fill("y", 1.0, ghost_layers=True)
z = dh.add_array('z', values_per_cell=1, dtype=dtype)
dh.fill("z", 2.0, ghost_layers=True)
config = pystencils.CreateKernelConfig(default_number_float=dtype)
# test sp.Max with one argument
assignment_1 = pystencils.Assignment(x.center, sympy_function(y.center + 3.3))
ast_1 = pystencils.create_kernel(assignment_1, config=config)
kernel_1 = ast_1.compile()
# pystencils.show_code(ast_1)
# test sp.Max with two arguments
assignment_2 = pystencils.Assignment(x.center, sympy_function(0.5, y.center - 1.5))
ast_2 = pystencils.create_kernel(assignment_2, config=config)
kernel_2 = ast_2.compile()
# pystencils.show_code(ast_2)
# test sp.Max with many arguments
assignment_3 = pystencils.Assignment(x.center, sympy_function(z.center, 4.5, y.center - 1.5, y.center + z.center))
ast_3 = pystencils.create_kernel(assignment_3, config=config)
kernel_3 = ast_3.compile()
# pystencils.show_code(ast_3)
if sympy_function is sp.Max:
results = [4.3, 0.5, 4.5]
else:
results = [4.3, -0.5, -0.5]
dh.run_kernel(kernel_1)
assert numpy.all(dh.gather_array('x') == results[0])
dh.run_kernel(kernel_2)
assert numpy.all(dh.gather_array('x') == results[1])
dh.run_kernel(kernel_3)
assert numpy.all(dh.gather_array('x') == results[2])
@pytest.mark.parametrize('dtype', ["int64", 'int32'])
@pytest.mark.parametrize('sympy_function', [sp.Min, sp.Max])
def test_max_integer(dtype, sympy_function):
dh = create_data_handling(domain_size=(10, 10), periodicity=True)
x = dh.add_array('x', values_per_cell=1, dtype=dtype)
dh.fill("x", 0, ghost_layers=True)
y = dh.add_array('y', values_per_cell=1, dtype=dtype)
dh.fill("y", 1, ghost_layers=True)
z = dh.add_array('z', values_per_cell=1, dtype=dtype)
dh.fill("z", 2, ghost_layers=True)
config = pystencils.CreateKernelConfig(default_number_int=dtype)
# test sp.Max with one argument
assignment_1 = pystencils.Assignment(x.center, sympy_function(y.center + 3))
ast_1 = pystencils.create_kernel(assignment_1, config=config)
kernel_1 = ast_1.compile()
# pystencils.show_code(ast_1)
# test sp.Max with two arguments
assignment_2 = pystencils.Assignment(x.center, sympy_function(1, y.center - 1))
ast_2 = pystencils.create_kernel(assignment_2, config=config)
kernel_2 = ast_2.compile()
# pystencils.show_code(ast_2)
# test sp.Max with many arguments
assignment_3 = pystencils.Assignment(x.center, sympy_function(z.center, 4, y.center - 1, y.center + z.center))
ast_3 = pystencils.create_kernel(assignment_3, config=config)
kernel_3 = ast_3.compile()
# pystencils.show_code(ast_3)
if sympy_function is sp.Max:
results = [4, 1, 4]
else:
results = [4, 0, 0]
dh.run_kernel(kernel_1)
assert numpy.all(dh.gather_array('x') == results[0])
dh.run_kernel(kernel_2)
assert numpy.all(dh.gather_array('x') == results[1])
dh.run_kernel(kernel_3)
assert numpy.all(dh.gather_array('x') == results[2])
import pytest
import pystencils.config
import sympy
import pystencils as ps
from pystencils.typing import CastFunc, create_type
@pytest.mark.parametrize('target', (ps.Target.CPU, ps.Target.GPU))
def test_abs(target):
x, y, z = ps.fields('x, y, z: float64[2d]')
default_int_type = create_type('int64')
assignments = ps.AssignmentCollection({x[0, 0]: sympy.Abs(CastFunc(y[0, 0], default_int_type))})
config = pystencils.config.CreateKernelConfig(target=target)
ast = ps.create_kernel(assignments, config=config)
code = ps.get_code_str(ast)
print(code)
assert 'fabs(' not in code
"""
Test of pystencils.data_types.address_of
"""
import pytest
import pystencils
from pystencils.typing import PointerType, CastFunc, BasicType
from pystencils.functions import AddressOf
from pystencils.simp.simplifications import sympy_cse
import sympy as sp
def test_address_of():
x, y = pystencils.fields('x, y: int64[2d]')
s = pystencils.TypedSymbol('s', PointerType(BasicType('int64')))
assert AddressOf(x[0, 0]).canonical() == x[0, 0]
assert AddressOf(x[0, 0]).dtype == PointerType(x[0, 0].dtype, restrict=True)
with pytest.raises(ValueError):
assert AddressOf(sp.Symbol("a")).dtype
assignments = pystencils.AssignmentCollection({
s: AddressOf(x[0, 0]),
y[0, 0]: CastFunc(s, BasicType('int64'))
})
kernel = pystencils.create_kernel(assignments).compile()
# pystencils.show_code(kernel.ast)
assignments = pystencils.AssignmentCollection({
y[0, 0]: CastFunc(AddressOf(x[0, 0]), BasicType('int64'))
})
kernel = pystencils.create_kernel(assignments).compile()
# pystencils.show_code(kernel.ast)
def test_address_of_with_cse():
x, y = pystencils.fields('x, y: int64[2d]')
assignments = pystencils.AssignmentCollection({
x[0, 0]: CastFunc(AddressOf(x[0, 0]), BasicType('int64')) + 1
})
kernel = pystencils.create_kernel(assignments).compile()
# pystencils.show_code(kernel.ast)
assignments_cse = sympy_cse(assignments)
kernel = pystencils.create_kernel(assignments_cse).compile()
# pystencils.show_code(kernel.ast)
import pytest
from pystencils import create_data_handling
from pystencils.alignedarray import *
from pystencils.field import create_numpy_array_with_layout
@@ -11,45 +13,45 @@ def is_aligned(arr, alignment, byte_offset=0):
return rest == 0
def test_1d_arrays():
for alignment in [8, 8*4]:
for shape in [17, 16, (16, 16), (17, 17), (18, 18), (19, 19)]:
arrays = [
aligned_zeros(shape, alignment),
aligned_ones(shape, alignment),
aligned_empty(shape, alignment),
]
for arr in arrays:
assert is_aligned(arr, alignment)
@pytest.mark.parametrize("alignment", [8, 8*4, True])
@pytest.mark.parametrize("shape", [17, 16, (16, 16), (17, 17), (18, 18), (19, 19)])
def test_1d_arrays(alignment, shape):
arrays = [
aligned_zeros(shape, alignment),
aligned_ones(shape, alignment),
aligned_empty(shape, alignment),
]
for arr in arrays:
assert is_aligned(arr, alignment)
def test_3d_arrays():
for order in ('C', 'F'):
for alignment in [8, 8*4]:
for shape in [(16, 16), (17, 17), (18, 18), (19, 19)]:
arrays = [
aligned_zeros(shape, alignment, order=order),
aligned_ones(shape, alignment, order=order),
aligned_empty(shape, alignment, order=order),
]
for arr in arrays:
assert is_aligned(arr, alignment)
if order == 'C':
assert is_aligned(arr[1], alignment)
assert is_aligned(arr[5], alignment)
else:
assert is_aligned(arr[..., 1], alignment)
assert is_aligned(arr[..., 5], alignment)
@pytest.mark.parametrize("order", ['C', 'F'])
@pytest.mark.parametrize("alignment", [8, 8*4, True])
@pytest.mark.parametrize("shape", [(16, 16), (17, 17), (18, 18), (19, 19)])
def test_3d_arrays(order, alignment, shape):
arrays = [
aligned_zeros(shape, alignment, order=order),
aligned_ones(shape, alignment, order=order),
aligned_empty(shape, alignment, order=order),
]
for arr in arrays:
assert is_aligned(arr, alignment)
if order == 'C':
assert is_aligned(arr[1], alignment)
assert is_aligned(arr[5], alignment)
else:
assert is_aligned(arr[..., 1], alignment)
assert is_aligned(arr[..., 5], alignment)
def test_data_handling():
for parallel in (False, True):
for tries in range(16):  # try a few times, since we might get a correct alignment by chance
dh = create_data_handling((6, 7), default_ghost_layers=1, parallel=parallel)
dh.add_array('test', alignment=8 * 4)
for b in dh.iterate(ghost_layers=True, inner_ghost_layers=True):
arr = b['test']
assert is_aligned(arr[1:, 3:], 8*4)
@pytest.mark.parametrize("parallel", [False, True])
def test_data_handling(parallel):
for tries in range(16):  # try a few times, since we might get a correct alignment by chance
dh = create_data_handling((6, 7), default_ghost_layers=1, parallel=parallel)
dh.add_array('test', alignment=8 * 4, values_per_cell=1)
for b in dh.iterate(ghost_layers=True, inner_ghost_layers=True):
arr = b['test']
assert is_aligned(arr[1:, 3:], 8*4)
def test_alignment_of_different_layouts():
@@ -57,13 +59,13 @@ def test_alignment_of_different_layouts():
byte_offset = 8
for tries in range(16):  # try a few times, since we might get a correct alignment by chance
arr = create_numpy_array_with_layout((3, 4, 5), layout=(0, 1, 2),
alignment=True, byte_offset=byte_offset)
alignment=8*4, byte_offset=byte_offset)
assert is_aligned(arr[offset, ...], 8*4, byte_offset)
arr = create_numpy_array_with_layout((3, 4, 5), layout=(2, 1, 0),
alignment=True, byte_offset=byte_offset)
alignment=8*4, byte_offset=byte_offset)
assert is_aligned(arr[..., offset], 8*4, byte_offset)
arr = create_numpy_array_with_layout((3, 4, 5), layout=(2, 0, 1),
alignment=True, byte_offset=byte_offset)
alignment=8*4, byte_offset=byte_offset)
assert is_aligned(arr[:, 0, :], 8*4, byte_offset)
import pytest
import sympy as sp
import pystencils as ps
from pystencils import Assignment, AssignmentCollection
from pystencils.astnodes import Conditional
from pystencils.simp.assignment_collection import SymbolGen
a, b, c = sp.symbols("a b c")
x, y, z, t = sp.symbols("x y z t")
symbol_gen = SymbolGen("a")
f = ps.fields("f(2) : [2D]")
d = ps.fields("d(2) : [2D]")
def test_assignment_collection():
ac = AssignmentCollection([Assignment(z, x + y)],
[], subexpression_symbol_generator=symbol_gen)
lhs = ac.add_subexpression(t)
assert lhs == sp.Symbol("a_0")
ac.subexpressions.append(Assignment(t, 3))
ac.topological_sort(sort_main_assignments=False, sort_subexpressions=True)
assert ac.subexpressions[0].lhs == t
assert ac.new_with_inserted_subexpression(sp.Symbol("not_defined")) == ac
ac_inserted = ac.new_with_inserted_subexpression(t)
ac_inserted2 = ac.new_without_subexpressions({lhs})
assert all(a == b for a, b in zip(ac_inserted.all_assignments, ac_inserted2.all_assignments))
print(ac_inserted)
assert ac_inserted.subexpressions[0] == Assignment(lhs, 3)
assert 'a_0' in str(ac_inserted)
assert '<table' in ac_inserted._repr_html_()
def test_free_and_defined_symbols():
ac = AssignmentCollection([Assignment(z, x + y), Conditional(t > 0, Assignment(a, b+1), Assignment(a, b+2))],
[], subexpression_symbol_generator=symbol_gen)
print(ac)
print(ac.__repr__)
def test_vector_assignments():
"""From #17 (https://i10git.cs.fau.de/pycodegen/pystencils/issues/17)"""
assignments = ps.Assignment(sp.Matrix([a, b, c]), sp.Matrix([1, 2, 3]))
print(assignments)
def test_wrong_vector_assignments():
"""From #17 (https://i10git.cs.fau.de/pycodegen/pystencils/issues/17)"""
with pytest.raises(AssertionError,
match=r'Matrix(.*) and Matrix(.*) must have same length when performing vector assignment!'):
ps.Assignment(sp.Matrix([a, b]), sp.Matrix([1, 2, 3]))
def test_vector_assignment_collection():
"""From #17 (https://i10git.cs.fau.de/pycodegen/pystencils/issues/17)"""
y_m, x_m = sp.Matrix([a, b, c]), sp.Matrix([1, 2, 3])
assignments = ps.AssignmentCollection({y_m: x_m})
print(assignments)
assignments = ps.AssignmentCollection([ps.Assignment(y_m, x_m)])
print(assignments)
def test_new_with_substitutions():
a1 = ps.Assignment(f[0, 0](0), a * b)
a2 = ps.Assignment(f[0, 0](1), b * c)
ac = ps.AssignmentCollection([a1, a2], subexpressions=[])
subs_dict = {f[0, 0](0): d[0, 0](0), f[0, 0](1): d[0, 0](1)}
subs_ac = ac.new_with_substitutions(subs_dict,
add_substitutions_as_subexpressions=False,
substitute_on_lhs=True,
sort_topologically=True)
assert subs_ac.main_assignments[0].lhs == d[0, 0](0)
assert subs_ac.main_assignments[1].lhs == d[0, 0](1)
subs_ac = ac.new_with_substitutions(subs_dict,
add_substitutions_as_subexpressions=False,
substitute_on_lhs=False,
sort_topologically=True)
assert subs_ac.main_assignments[0].lhs == f[0, 0](0)
assert subs_ac.main_assignments[1].lhs == f[0, 0](1)
subs_dict = {a * b: sp.symbols('xi')}
subs_ac = ac.new_with_substitutions(subs_dict,
add_substitutions_as_subexpressions=False,
substitute_on_lhs=False,
sort_topologically=True)
assert subs_ac.main_assignments[0].rhs == sp.symbols('xi')
assert len(subs_ac.subexpressions) == 0
subs_ac = ac.new_with_substitutions(subs_dict,
add_substitutions_as_subexpressions=True,
substitute_on_lhs=False,
sort_topologically=True)
assert subs_ac.main_assignments[0].rhs == sp.symbols('xi')
assert len(subs_ac.subexpressions) == 1
assert subs_ac.subexpressions[0].lhs == sp.symbols('xi')
def test_copy():
a1 = ps.Assignment(f[0, 0](0), a * b)
a2 = ps.Assignment(f[0, 0](1), b * c)
ac = ps.AssignmentCollection([a1, a2], subexpressions=[])
ac2 = ac.copy()
assert ac2 == ac
def test_set_expressions():
a1 = ps.Assignment(f[0, 0](0), a * b)
a2 = ps.Assignment(f[0, 0](1), b * c)
ac = ps.AssignmentCollection([a1, a2], subexpressions=[])
ac.set_main_assignments_from_dict({d[0, 0](0): b * c})
assert len(ac.main_assignments) == 1
assert ac.main_assignments[0] == ps.Assignment(d[0, 0](0), b * c)
ac.set_sub_expressions_from_dict({sp.symbols('xi'): a * b})
assert len(ac.subexpressions) == 1
assert ac.subexpressions[0] == ps.Assignment(sp.symbols('xi'), a * b)
ac = ac.new_without_subexpressions(subexpressions_to_keep={sp.symbols('xi')})
assert ac.subexpressions[0] == ps.Assignment(sp.symbols('xi'), a * b)
ac = ac.new_without_unused_subexpressions()
assert len(ac.subexpressions) == 0
ac2 = ac.new_without_subexpressions()
assert ac == ac2
def test_free_and_bound_symbols():
a1 = ps.Assignment(a, d[0, 0](0))
a2 = ps.Assignment(f[0, 0](1), b * c)
ac = ps.AssignmentCollection([a2], subexpressions=[a1])
assert f[0, 0](1) in ac.bound_symbols
assert d[0, 0](0) in ac.free_symbols
def test_new_merged():
a1 = ps.Assignment(a, b * c)
a2 = ps.Assignment(a, x * y)
a3 = ps.Assignment(t, x ** 2)
# main assignments
a4 = ps.Assignment(f[0, 0](0), a)
a5 = ps.Assignment(d[0, 0](0), a)
ac = ps.AssignmentCollection([a4], subexpressions=[a1])
ac2 = ps.AssignmentCollection([a5], subexpressions=[a2, a3])
merged_ac = ac.new_merged(ac2)
assert len(merged_ac.subexpressions) == 3
assert len(merged_ac.main_assignments) == 2
assert ps.Assignment(sp.symbols('xi_0'), x * y) in merged_ac.subexpressions
assert ps.Assignment(d[0, 0](0), sp.symbols('xi_0')) in merged_ac.main_assignments
assert a1 in merged_ac.subexpressions
assert a3 in merged_ac.subexpressions
a1 = ps.Assignment(a, 20)
a2 = ps.Assignment(a, 10)
acommon = ps.Assignment(b, a)
# main assignments
a3 = ps.Assignment(f[0, 0](0), b)
a4 = ps.Assignment(d[0, 0](0), b)
ac = ps.AssignmentCollection([a3], subexpressions=[a1, acommon])
ac2 = ps.AssignmentCollection([a4], subexpressions=[a2, acommon])
merged_ac = ac.new_merged(ac2).new_without_subexpressions()
assert ps.Assignment(f[0, 0](0), 20) in merged_ac.main_assignments
assert ps.Assignment(d[0, 0](0), 10) in merged_ac.main_assignments
@@ -30,11 +30,3 @@ def test_assignment_collection_dict_conversion():
assert collection_dict.main_assignments_dict == {y[1, 0]: x.center(),
y[0, 0]: x.center()}
assert collection_dict.subexpressions_dict == {}
def main():
test_assignment_collection_dict_conversion()
if __name__ == '__main__':
main()
import numpy as np
import pystencils
def test_assignment_from_stencil():
import pytest
import sys
import pystencils.config
import sympy as sp
import pystencils as ps
from pystencils import Assignment
from pystencils.astnodes import Block, LoopOverCoordinate, SkipIteration, SympyAssignment
dst = ps.fields('dst(8): double[2D]')
s = sp.symbols('s_:8')
x = sp.symbols('x')
y = sp.symbols('y')
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
def test_kernel_function():
assignments = [
Assignment(dst[0, 0](0), s[0]),
Assignment(x, dst[0, 0](2))
]
ast_node = ps.create_kernel(assignments)
assert ast_node.target == ps.Target.CPU
assert ast_node.backend == ps.Backend.C
# symbols_defined and undefined_symbols will always return an empty set
assert ast_node.symbols_defined == set()
assert ast_node.undefined_symbols == set()
assert ast_node.fields_written == {dst}
assert ast_node.fields_read == {dst}
def test_skip_iteration():
# SkipIteration is an object which should give back empty data structures.
skipped = SkipIteration()
assert skipped.args == []
assert skipped.symbols_defined == set()
assert skipped.undefined_symbols == set()
def test_block():
assignments = [
Assignment(dst[0, 0](0), s[0]),
Assignment(x, dst[0, 0](2))
]
bl = Block(assignments)
assert bl.symbols_defined == {dst[0, 0](0), dst[0, 0](2), s[0], x}
bl.append([Assignment(y, 10)])
assert bl.symbols_defined == {dst[0, 0](0), dst[0, 0](2), s[0], x, y}
assert len(bl.args) == 3
list_iterator = iter([Assignment(s[1], 11)])
bl.insert_front(list_iterator)
assert bl.args[0] == Assignment(s[1], 11)
def test_loop_over_coordinate():
assignments = [
Assignment(dst[0, 0](0), s[0]),
Assignment(x, dst[0, 0](2))
]
body = Block(assignments)
loop = LoopOverCoordinate(body, coordinate_to_loop_over=0, start=0, stop=10, step=1)
assert loop.body == body
new_body = Block([assignments[0]])
loop = loop.new_loop_with_different_body(new_body)
assert loop.body == new_body
assert loop.start == 0
assert loop.stop == 10
assert loop.step == 1
loop.replace(loop.start, 2)
loop.replace(loop.stop, 20)
loop.replace(loop.step, 2)
assert loop.start == 2
assert loop.stop == 20
assert loop.step == 2
import pytest
import pystencils as ps
@pytest.mark.parametrize('target', [ps.Target.CPU, ps.Target.GPU])
def test_add_augmented_assignment(target):
if target == ps.Target.GPU:
pytest.importorskip("cupy")
domain_size = (5, 5)
dh = ps.create_data_handling(domain_size=domain_size, periodicity=True, default_target=target)
f = dh.add_array("f", values_per_cell=1)
dh.fill(f.name, 0.0)
g = dh.add_array("g", values_per_cell=1)
dh.fill(g.name, 1.0)
up = ps.AddAugmentedAssignment(f.center, g.center)
config = ps.CreateKernelConfig(target=dh.default_target)
ast = ps.create_kernel(up, config=config)
kernel = ast.compile()
for i in range(10):
dh.run_kernel(kernel)
if target == ps.Target.GPU:
dh.all_to_cpu()
result = dh.gather_array(f.name)
for x in range(domain_size[0]):
for y in range(domain_size[1]):
assert result[x, y] == 10
import pytest
from pystencils import Assignment, CreateKernelConfig, Target, fields, create_kernel, get_code_str
@pytest.mark.parametrize('target', (Target.CPU, Target.GPU))
def test_intermediate_base_pointer(target):
x = fields('x: double[3d]')
y = fields('y: double[3d]')
update = Assignment(x.center, y.center)
config = CreateKernelConfig(base_pointer_specification=[], target=target)
ast = create_kernel(update, config=config)
code = get_code_str(ast)
# no intermediate base pointers are created
assert "_data_x[_stride_x_0*ctr_0 + _stride_x_1*ctr_1 + _stride_x_2*ctr_2] = " \
"_data_y[_stride_y_0*ctr_0 + _stride_y_1*ctr_1 + _stride_y_2*ctr_2];" in code
config = CreateKernelConfig(base_pointer_specification=[[0]], target=target)
ast = create_kernel(update, config=config)
code = get_code_str(ast)
# intermediate base pointers for y and z
assert "double * RESTRICT _data_x_10_20 = _data_x + _stride_x_1*ctr_1 + _stride_x_2*ctr_2;" in code
assert " double * RESTRICT _data_y_10_20 = _data_y + _stride_y_1*ctr_1 + _stride_y_2*ctr_2;" in code
assert "_data_x_10_20[_stride_x_0*ctr_0] = _data_y_10_20[_stride_y_0*ctr_0];" in code
config = CreateKernelConfig(base_pointer_specification=[[1]], target=target)
ast = create_kernel(update, config=config)
code = get_code_str(ast)
# intermediate base pointers for x and z
assert "double * RESTRICT _data_x_00_20 = _data_x + _stride_x_0*ctr_0 + _stride_x_2*ctr_2;" in code
assert "double * RESTRICT _data_y_00_20 = _data_y + _stride_y_0*ctr_0 + _stride_y_2*ctr_2;" in code
assert "_data_x_00_20[_stride_x_1*ctr_1] = _data_y_00_20[_stride_y_1*ctr_1];" in code
config = CreateKernelConfig(base_pointer_specification=[[2]], target=target)
ast = create_kernel(update, config=config)
code = get_code_str(ast)
# intermediate base pointers for x and y
assert "double * RESTRICT _data_x_00_10 = _data_x + _stride_x_0*ctr_0 + _stride_x_1*ctr_1;" in code
assert "double * RESTRICT _data_y_00_10 = _data_y + _stride_y_0*ctr_0 + _stride_y_1*ctr_1;" in code
assert "_data_x_00_10[_stride_x_2*ctr_2] = _data_y_00_10[_stride_y_2*ctr_2];" in code
config = CreateKernelConfig(target=target)
ast = create_kernel(update, config=config)
code = get_code_str(ast)
# by default no intermediate base pointers are created
assert "_data_x[_stride_x_0*ctr_0 + _stride_x_1*ctr_1 + _stride_x_2*ctr_2] = " \
"_data_y[_stride_y_0*ctr_0 + _stride_y_1*ctr_1 + _stride_y_2*ctr_2];" in code
import pytest
import numpy as np
import pystencils as ps
from pystencils import Field, Assignment, create_kernel
from pystencils.bit_masks import flag_cond
@pytest.mark.parametrize('mask_type', [np.uint8, np.uint16, np.uint32, np.uint64])
def test_flag_condition(mask_type):
f_arr = np.zeros((2, 2, 2), dtype=np.float64)
mask_arr = np.zeros((2, 2), dtype=mask_type)
mask_arr[0, 1] = (1 << 3)
mask_arr[1, 0] = (1 << 5)
mask_arr[1, 1] = (1 << 3) + (1 << 5)
f = Field.create_from_numpy_array('f', f_arr, index_dimensions=1)
mask = Field.create_from_numpy_array('mask', mask_arr)
v1 = 42.3
v2 = 39.7
v3 = 119
assignments = [
Assignment(f(0), flag_cond(3, mask(0), v1)),
Assignment(f(1), flag_cond(5, mask(0), v2, v3))
]
kernel = create_kernel(assignments).compile()
kernel(f=f_arr, mask=mask_arr)
code = ps.get_code_str(kernel)
assert '119.0' in code
reference = np.zeros((2, 2, 2), dtype=np.float64)
reference[0, 1, 0] = v1
reference[1, 1, 0] = v1
reference[0, 0, 1] = v3
reference[0, 1, 1] = v3
reference[1, 0, 1] = v2
reference[1, 1, 1] = v2
np.testing.assert_array_equal(f_arr, reference)
import numpy as np
import sympy as sp
import pystencils as ps
@@ -14,21 +15,32 @@ def jacobi(dst, src):
def check_equivalence(assignments, src_arr):
for openmp in (False, True):
for vectorization in (True, False):
for vectorization in [False, {'assume_inner_stride_one': True}]:
with_blocking = ps.create_kernel(assignments, cpu_blocking=(8, 16, 4), cpu_openmp=openmp,
cpu_vectorize_info=vectorization).compile()
with_blocking_only_over_y = ps.create_kernel(assignments, cpu_blocking=(0, 16, 0), cpu_openmp=openmp,
cpu_vectorize_info=vectorization).compile()
without_blocking = ps.create_kernel(assignments).compile()
print(" openmp {}, vectorization {}".format(openmp, vectorization))
only_omp = ps.create_kernel(assignments, cpu_openmp=2).compile()
print(f" openmp {openmp}, vectorization {vectorization}")
dst_arr = np.zeros_like(src_arr)
dst2_arr = np.zeros_like(src_arr)
dst3_arr = np.zeros_like(src_arr)
ref_arr = np.zeros_like(src_arr)
np.copyto(src_arr, np.random.rand(*src_arr.shape))
with_blocking(src=src_arr, dst=dst_arr)
with_blocking_only_over_y(src=src_arr, dst=dst2_arr)
without_blocking(src=src_arr, dst=ref_arr)
only_omp(src=src_arr, dst=dst3_arr)
np.testing.assert_almost_equal(ref_arr, dst_arr)
np.testing.assert_almost_equal(ref_arr, dst2_arr)
np.testing.assert_almost_equal(ref_arr, dst3_arr)
def test_jacobi3d_var_size():
src, dst = ps.fields("src, dst: double[3D]")
src, dst = ps.fields("src, dst: double[3D]", layout='c')
print("Var Size: Smaller than block sizes")
arr = np.empty([4, 5, 6])
@@ -59,3 +71,10 @@ def test_jacobi3d_fixed_size():
src, dst = ps.fields("src, dst: double[3D]", src=arr, dst=arr)
check_equivalence(jacobi(dst, src), arr)
def test_jacobi3d_fixed_field_size():
src, dst = ps.fields("src, dst: double[3, 5, 6]", layout='c')
print("Fixed Field Size: Smaller than block sizes")
arr = np.empty([3, 5, 6])
check_equivalence(jacobi(dst, src), arr)
import numpy as np
import pystencils as ps
def test_blocking_staggered():
f, stag = ps.fields("f, stag(3): double[3D]")
f = ps.fields("f: double[3D]")
stag = ps.fields("stag(3): double[3D]", field_type=ps.FieldType.STAGGERED)
terms = [
f[0, 0, 0] - f[-1, 0, 0],
f[0, 0, 0] - f[0, -1, 0],
f[0, 0, 0] - f[0, 0, -1],
]
kernel = ps.create_staggered_kernel(stag, terms, cpu_blocking=(3, 16, 8)).compile()
reference_kernel = ps.create_staggered_kernel(stag, terms).compile()
assignments = [ps.Assignment(stag.staggered_access(d), terms[i]) for i, d in enumerate(stag.staggered_stencil)]
reference_kernel = ps.create_staggered_kernel(assignments)
print(ps.show_code(reference_kernel))
reference_kernel = reference_kernel.compile()
kernel = ps.create_staggered_kernel(assignments, cpu_blocking=(3, 16, 8)).compile()
print(ps.show_code(kernel.ast))
f_arr = np.random.rand(80, 33, 19)
import os
from tempfile import TemporaryDirectory
import numpy as np
import pytest
from pystencils import create_kernel, Assignment
from pystencils.boundaries import add_neumann_boundary, Neumann, BoundaryHandling
import pystencils
from pystencils import Assignment, create_kernel
from pystencils.boundaries import BoundaryHandling, Dirichlet, Neumann, add_neumann_boundary
from pystencils.datahandling import SerialDataHandling
from pystencils.enums import Target
from pystencils.slicing import slice_from_direction
from pystencils.timeloop import TimeLoop
def test_kernel_vs_copy_boundary():
@@ -82,5 +87,160 @@ def test_kernel_vs_copy_boundary():
np.testing.assert_almost_equal(python_copy_result, handling_result)
with TemporaryDirectory() as tmp_dir:
pytest.importorskip('pyevtk')
boundary_handling.geometry_to_vtk(file_name=os.path.join(tmp_dir, 'test_output1'), ghost_layers=False)
boundary_handling.geometry_to_vtk(file_name=os.path.join(tmp_dir, 'test_output2'), ghost_layers=True)
boundaries = list(boundary_handling._boundary_object_to_boundary_info.keys()) + ['domain']
boundary_handling.geometry_to_vtk(file_name=os.path.join(tmp_dir, 'test_output3'),
boundaries=boundaries[0], ghost_layers=False)
def test_boundary_gpu():
pytest.importorskip('cupy')
dh = SerialDataHandling(domain_size=(7, 7), default_target=Target.GPU)
src = dh.add_array('src')
dh.fill("src", 0.0, ghost_layers=True)
dh.fill("src", 1.0, ghost_layers=False)
src_cpu = dh.add_array('src_cpu', gpu=False)
dh.fill("src_cpu", 0.0, ghost_layers=True)
dh.fill("src_cpu", 1.0, ghost_layers=False)
boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
boundary_handling_cpu = BoundaryHandling(dh, src_cpu.name, boundary_stencil,
name="boundary_handling_cpu", target=Target.CPU)
boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
name="boundary_handling_gpu", target=Target.GPU)
neumann = Neumann()
for d in ('N', 'S', 'W', 'E'):
boundary_handling.set_boundary(neumann, slice_from_direction(d, dim=2))
boundary_handling_cpu.set_boundary(neumann, slice_from_direction(d, dim=2))
boundary_handling.prepare()
boundary_handling_cpu.prepare()
boundary_handling_cpu()
dh.all_to_gpu()
boundary_handling()
dh.all_to_cpu()
np.testing.assert_almost_equal(dh.cpu_arrays["src_cpu"], dh.cpu_arrays["src"])
def test_boundary_utility():
dh = SerialDataHandling(domain_size=(7, 7))
src = dh.add_array('src')
dh.fill("src", 0.0, ghost_layers=True)
boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
name="boundary_handling", target=Target.CPU)
neumann = Neumann()
dirichlet = Dirichlet(2)
for d in ('N', 'S', 'W', 'E'):
boundary_handling.set_boundary(neumann, slice_from_direction(d, dim=2))
boundary_handling.set_boundary(neumann, (slice(2, 4, None), slice(2, 4, None)))
boundary_handling.prepare()
assert boundary_handling.get_flag(boundary_handling.boundary_objects[0]) == 2
assert boundary_handling.shape == dh.shape
assert boundary_handling.flag_array_name == 'boundary_handlingFlags'
mask_neumann = boundary_handling.get_mask((slice(0, 7), slice(0, 7)), boundary_handling.boundary_objects[0])
np.testing.assert_almost_equal(mask_neumann[1:3, 1:3], 2)
mask_domain = boundary_handling.get_mask((slice(0, 7), slice(0, 7)), "domain")
assert np.sum(mask_domain) == 7 ** 2 - 4
def set_sphere(x, y):
mid = (4, 4)
radius = 2
return (x - mid[0]) ** 2 + (y - mid[1]) ** 2 < radius ** 2
boundary_handling.set_boundary(dirichlet, mask_callback=set_sphere, force_flag_value=4)
mask_dirichlet = boundary_handling.get_mask((slice(0, 7), slice(0, 7)), boundary_handling.boundary_objects[1])
assert np.sum(mask_dirichlet) == 48
assert boundary_handling.set_boundary("domain") == 1
assert boundary_handling.set_boundary(dirichlet, mask_callback=set_sphere, force_flag_value=8, replace=False) == 4
assert boundary_handling.set_boundary(dirichlet, force_flag_value=16, replace=False) == 4
assert boundary_handling.set_boundary_where_flag_is_set(boundary_handling.boundary_objects[0], 16) == 16
def test_add_fix_steps():
dh = SerialDataHandling(domain_size=(7, 7))
src = dh.add_array('src')
dh.fill("src", 0.0, ghost_layers=True)
dh.fill("src", 1.0, ghost_layers=False)
boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
name="boundary_handling", target=pystencils.Target.CPU)
neumann = Neumann()
for d in ('N', 'S', 'W', 'E'):
boundary_handling.set_boundary(neumann, slice_from_direction(d, dim=2))
timeloop = TimeLoop(steps=1)
boundary_handling.add_fixed_steps(timeloop)
timeloop.run()
assert np.sum(dh.cpu_arrays['src']) == 7 * 7 + 7 * 4
def test_boundary_data_setter():
dh = SerialDataHandling(domain_size=(7, 7))
src = dh.add_array('src')
dh.fill("src", 0.0, ghost_layers=True)
dh.fill("src", 1.0, ghost_layers=False)
boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil,
name="boundary_handling", target=Target.CPU)
neumann = Neumann()
for d in 'N':
boundary_handling.set_boundary(neumann, slice_from_direction(d, dim=2))
boundary_handling.prepare()
for b in dh.iterate(ghost_layers=True):
index_array_bd = b[boundary_handling._index_array_name]
data_setter = index_array_bd.boundary_object_to_data_setter[boundary_handling.boundary_objects[0]]
y_pos = data_setter.boundary_cell_positions(1)
assert all(y_pos == 5.5)
assert np.all(data_setter.link_offsets() == [0, -1])
assert np.all(data_setter.link_positions(1) == 6.)
@pytest.mark.parametrize('with_indices', ('with_indices', False))
def test_dirichlet(with_indices):
value = (1, 20, 3) if with_indices else 1
dh = SerialDataHandling(domain_size=(7, 7))
src = dh.add_array('src', values_per_cell=3 if with_indices else 1)
dh.cpu_arrays.src[...] = np.random.rand(*src.shape)
boundary_stencil = [(1, 0), (-1, 0), (0, 1), (0, -1)]
boundary_handling = BoundaryHandling(dh, src.name, boundary_stencil)
dirichlet = Dirichlet(value)
assert dirichlet.name == 'Dirichlet'
dirichlet.name = "wall"
assert dirichlet.name == 'wall'
for d in ('N', 'S', 'W', 'E'):
boundary_handling.set_boundary(dirichlet, slice_from_direction(d, dim=2))
boundary_handling()
assert all([np.allclose(a, np.array(value)) for a in dh.cpu_arrays.src[1:-2, 0]])
assert all([np.allclose(a, np.array(value)) for a in dh.cpu_arrays.src[1:-2, -1]])
assert all([np.allclose(a, np.array(value)) for a in dh.cpu_arrays.src[0, 1:-2]])
assert all([np.allclose(a, np.array(value)) for a in dh.cpu_arrays.src[-1, 1:-2]])
import numpy as np
from itertools import product
import pystencils.boundaries.createindexlist as cil
import pytest
@pytest.mark.parametrize('single_link', [False, True])
@pytest.mark.skipif(not cil.cython_funcs_available, reason='Cython functions are not available')
def test_equivalence_cython_python_version(single_link):
# D2Q9
stencil_2d = tuple((x, y) for x, y in product([-1, 0, 1], [-1, 0, 1]))
# D3Q19
stencil_3d = tuple(
(x, y, z) for x, y, z in product([-1, 0, 1], [-1, 0, 1], [-1, 0, 1]) if abs(x) + abs(y) + abs(z) < 3)
for dtype in [int, np.int16, np.uint32]:
fluid_mask = dtype(1)
mask = dtype(2)
flag_field_2d = np.ones([15, 16], dtype=dtype) * fluid_mask
flag_field_3d = np.ones([15, 16, 17], dtype=dtype) * fluid_mask
flag_field_2d[0, :] = mask
flag_field_2d[-1, :] = mask
flag_field_2d[7, 7] = mask
flag_field_3d[0, :, :] = mask
flag_field_3d[-1, :, :] = mask
flag_field_3d[7, 7, 7] = mask
result_python_2d = cil._create_index_list_python(flag_field_2d, mask, fluid_mask,
stencil_2d, single_link, True, 1)
result_python_3d = cil._create_index_list_python(flag_field_3d, mask, fluid_mask,
stencil_3d, single_link, True, 1)
result_cython_2d = cil.create_boundary_index_list(flag_field_2d, stencil_2d, mask,
fluid_mask, 1, True, single_link)
result_cython_3d = cil.create_boundary_index_list(flag_field_3d, stencil_3d, mask,
fluid_mask, 1, True, single_link)
np.testing.assert_equal(result_python_2d, result_cython_2d)
np.testing.assert_equal(result_python_3d, result_cython_3d)
@pytest.mark.parametrize('single_link', [False, True])
@pytest.mark.skipif(not cil.cython_funcs_available, reason='Cython functions are not available')
def test_equivalence_cell_idx_list_cython_python_version(single_link):
# D2Q9
stencil_2d = tuple((x, y) for x, y in product([-1, 0, 1], [-1, 0, 1]))
# D3Q19
stencil_3d = tuple(
(x, y, z) for x, y, z in product([-1, 0, 1], [-1, 0, 1], [-1, 0, 1]) if abs(x) + abs(y) + abs(z) < 3)
for dtype in [int, np.int16, np.uint32]:
fluid_mask = dtype(1)
mask = dtype(2)
flag_field_2d = np.ones([15, 16], dtype=dtype) * fluid_mask
flag_field_3d = np.ones([15, 16, 17], dtype=dtype) * fluid_mask
flag_field_2d[0, :] = mask
flag_field_2d[-1, :] = mask
flag_field_2d[7, 7] = mask
flag_field_3d[0, :, :] = mask
flag_field_3d[-1, :, :] = mask
flag_field_3d[7, 7, 7] = mask
result_python_2d = cil._create_index_list_python(flag_field_2d, mask, fluid_mask,
stencil_2d, single_link, False)
result_python_3d = cil._create_index_list_python(flag_field_3d, mask, fluid_mask,
stencil_3d, single_link, False)
result_cython_2d = cil.create_boundary_index_list(flag_field_2d, stencil_2d, mask, fluid_mask, None,
False, single_link)
result_cython_3d = cil.create_boundary_index_list(flag_field_3d, stencil_3d, mask, fluid_mask, None,
False, single_link)
np.testing.assert_equal(result_python_2d, result_cython_2d)
np.testing.assert_equal(result_python_3d, result_cython_3d)
@pytest.mark.parametrize('inner_or_boundary', [False, True])
def test_normal_calculation(inner_or_boundary):
stencil = tuple((x, y) for x, y in product([-1, 0, 1], [-1, 0, 1]))
domain_size = (32, 32)
dtype = np.uint32
fluid_mask = dtype(1)
mask = dtype(2)
flag_field = np.ones([domain_size[0], domain_size[1]], dtype=dtype) * fluid_mask
radius_inner = domain_size[0] // 4
radius_outer = domain_size[0] // 2
y_mid = domain_size[1] / 2
x_mid = domain_size[0] / 2
for x in range(0, domain_size[0]):
for y in range(0, domain_size[1]):
if (y - y_mid) ** 2 + (x - x_mid) ** 2 < radius_inner ** 2:
flag_field[x, y] = mask
if (x - x_mid) ** 2 + (y - y_mid) ** 2 > radius_outer ** 2:
flag_field[x, y] = mask
args_no_gl = (flag_field, mask, fluid_mask, np.array(stencil, dtype=np.int32), True)
index_list = cil._create_index_list_python(*args_no_gl, inner_or_boundary=inner_or_boundary, nr_of_ghost_layers=1)
checkmask = mask if inner_or_boundary else fluid_mask
for cell in index_list:
idx = cell[2]
cell = tuple((cell[0], cell[1]))
sum_cells = np.zeros(len(cell))
for dir_idx, direction in enumerate(stencil):
neighbor_cell = tuple([cell_i + dir_i for cell_i, dir_i in zip(cell, direction)])
if any(not 0 <= e < upper for e, upper in zip(neighbor_cell, flag_field.shape)):
continue
if flag_field[neighbor_cell] & checkmask:
sum_cells += np.array(direction)
assert np.argmax(np.inner(sum_cells, stencil)) == idx
"""Tests (un)packing (from)to buffers."""
import numpy as np
from pystencils import Field, FieldType, Assignment, create_kernel
from pystencils.field import layout_string_to_tuple, create_numpy_array_with_layout
from pystencils.stencils import direction_string_to_offset
from pystencils.slicing import add_ghost_layers, get_slice_before_ghost_layer, get_ghost_region_slice
import pystencils as ps
from pystencils import Assignment, Field, FieldType, create_kernel
from pystencils.field import create_numpy_array_with_layout, layout_string_to_tuple
from pystencils.slicing import (
add_ghost_layers, get_ghost_region_slice, get_slice_before_ghost_layer)
from pystencils.stencil import direction_string_to_offset
FIELD_SIZES = [(32, 10), (10, 8, 6)]
@@ -18,9 +20,9 @@ def _generate_fields(dt=np.uint64, num_directions=1, layout='numpy'):
fields = []
for size in field_sizes:
field_layout = layout_string_to_tuple(layout, len(size))
src_arr = create_numpy_array_with_layout(size, field_layout)
src_arr = create_numpy_array_with_layout(size, field_layout, dtype=dt)
array_data = np.reshape(np.arange(1, int(np.prod(size)+1)), size)
array_data = np.reshape(np.arange(1, int(np.prod(size) + 1)), size)
# Use flat iterator to input data into the array
src_arr.flat = add_ghost_layers(array_data, index_dimensions=1 if num_directions > 1 else 0).astype(dt).flat
dst_arr = np.zeros(src_arr.shape, dtype=dt)
@@ -39,13 +41,18 @@ def test_full_scalar_field():
field_type=FieldType.BUFFER, dtype=src_arr.dtype)
pack_eqs = [Assignment(buffer.center(), src_field.center())]
pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
code = ps.get_code_str(pack_code)
ps.show_code(pack_code)
pack_kernel = pack_code.compile()
pack_kernel(buffer=buffer_arr, src_field=src_arr)
unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(dst_field=dst_arr, buffer=buffer_arr)
@@ -69,14 +76,18 @@ def test_field_slice():
field_type=FieldType.BUFFER, dtype=src_arr.dtype)
pack_eqs = [Assignment(buffer.center(), src_field.center())]
pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=bufferArr, src_field=src_arr[pack_slice])
# Unpack into ghost layer of dst_field in N direction
unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=bufferArr, dst_field=dst_arr[unpack_slice])
@@ -101,7 +112,8 @@ def test_all_cell_values():
eq = Assignment(buffer(idx), src_field(idx))
pack_eqs.append(eq)
pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=bufferArr, src_field=src_arr)
@@ -111,7 +123,8 @@ def test_all_cell_values():
eq = Assignment(dst_field(idx), buffer(idx))
unpack_eqs.append(eq)
unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=bufferArr, dst_field=dst_arr)
@@ -137,7 +150,8 @@ def test_subset_cell_values():
eq = Assignment(buffer(buffer_idx), src_field(cell_idx))
pack_eqs.append(eq)
pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=bufferArr, src_field=src_arr)
@@ -147,7 +161,8 @@ def test_subset_cell_values():
eq = Assignment(dst_field(cell_idx), buffer(buffer_idx))
unpack_eqs.append(eq)
unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=bufferArr, dst_field=dst_arr)
@@ -172,7 +187,8 @@ def test_field_layouts():
eq = Assignment(buffer(idx), src_field(idx))
pack_eqs.append(eq)
pack_code = create_kernel(pack_eqs, data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=bufferArr, src_field=src_arr)
@@ -182,6 +198,62 @@ def test_field_layouts():
eq = Assignment(dst_field(idx), buffer(idx))
unpack_eqs.append(eq)
unpack_code = create_kernel(unpack_eqs, data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
config = ps.CreateKernelConfig(data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=bufferArr, dst_field=dst_arr)
def test_iteration_slices():
num_cell_values = 19
dt = np.uint64
fields = _generate_fields(dt=dt, num_directions=num_cell_values)
for (src_arr, dst_arr, bufferArr) in fields:
spatial_dimensions = len(src_arr.shape) - 1
# src_field = Field.create_from_numpy_array("src_field", src_arr, index_dimensions=1)
# dst_field = Field.create_from_numpy_array("dst_field", dst_arr, index_dimensions=1)
src_field = Field.create_generic("src_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt)
dst_field = Field.create_generic("dst_field", spatial_dimensions, index_shape=(num_cell_values,), dtype=dt)
buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1,
field_type=FieldType.BUFFER, dtype=src_arr.dtype)
pack_eqs = []
# Since we are packing all cell values for all cells,
# the buffer index is equivalent to the field index
for idx in range(num_cell_values):
eq = Assignment(buffer(idx), src_field(idx))
pack_eqs.append(eq)
dim = src_field.spatial_dimensions
# Pack only the leftmost slice, only every second cell
pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,)
# Fill the entire array with data
src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values)
dst_arr.fill(0)
config = ps.CreateKernelConfig(iteration_slice=pack_slice,
data_type={'src_field': src_arr.dtype, 'buffer': buffer.dtype})
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=bufferArr, src_field=src_arr)
unpack_eqs = []
for idx in range(num_cell_values):
eq = Assignment(dst_field(idx), buffer(idx))
unpack_eqs.append(eq)
config = ps.CreateKernelConfig(iteration_slice=pack_slice,
data_type={'dst_field': dst_arr.dtype, 'buffer': buffer.dtype})
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=bufferArr, dst_field=dst_arr)
# Check if only every second entry of the leftmost slice has been copied
np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice])
np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0)
np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0)
"""Tests for the (un)packing (from)to buffers on a CUDA GPU."""
from dataclasses import replace
import numpy as np
from pystencils import Field, FieldType, Assignment
from pystencils.field import layout_string_to_tuple, create_numpy_array_with_layout
from pystencils.stencils import direction_string_to_offset
from pystencils.gpucuda import make_python_function, create_cuda_kernel
from pystencils.slicing import add_ghost_layers, get_slice_before_ghost_layer, get_ghost_region_slice
import pytest
import pystencils
from pystencils import Assignment, Field, FieldType, Target, CreateKernelConfig, create_kernel, fields
from pystencils.bit_masks import flag_cond
from pystencils.field import create_numpy_array_with_layout, layout_string_to_tuple
from pystencils.slicing import (
add_ghost_layers, get_ghost_region_slice, get_slice_before_ghost_layer)
from pystencils.stencil import direction_string_to_offset
try:
# noinspection PyUnresolvedReferences
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import cupy as cp
except ImportError:
pass
@@ -19,6 +23,7 @@ FIELD_SIZES = [(4, 3), (9, 3, 7)]
def _generate_fields(dt=np.uint8, stencil_directions=1, layout='numpy'):
pytest.importorskip('cupy')
field_sizes = FIELD_SIZES
if stencil_directions > 1:
field_sizes = [s + (stencil_directions,) for s in field_sizes]
@@ -33,15 +38,15 @@ def _generate_fields(dt=np.uint8, stencil_directions=1, layout='numpy'):
src_arr.flat = add_ghost_layers(array_data,
index_dimensions=1 if stencil_directions > 1 else 0).astype(dt).flat
gpu_src_arr = gpuarray.to_gpu(src_arr)
gpu_dst_arr = gpuarray.zeros_like(gpu_src_arr)
gpu_buffer_arr = gpuarray.zeros(np.prod(src_arr.shape), dtype=dt)
gpu_src_arr = cp.asarray(src_arr)
gpu_dst_arr = cp.zeros_like(gpu_src_arr)
size = int(np.prod(src_arr.shape))
gpu_buffer_arr = cp.zeros(size, dtype=dt)
fields.append((src_arr, gpu_src_arr, gpu_dst_arr, gpu_buffer_arr))
return fields
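# Illustrative sketch, not part of the original tests: the host/device round trip that the
# tuples returned above rely on - copy host data to the device with cupy, do some work there,
# and copy the result back for verification. The kernel call is replaced by a plain
# assignment; the name _demo_host_device_roundtrip is hypothetical.
def _demo_host_device_roundtrip():
    host = np.arange(6, dtype=np.uint8).reshape(2, 3)
    device = cp.asarray(host)        # host -> device copy
    result = cp.zeros_like(device)   # device-side destination, as for gpu_dst_arr
    result[...] = device             # stands in for the pack/unpack kernels
    assert np.array_equal(result.get(), host)  # device -> host copy for the check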
@pytest.mark.gpu
def test_full_scalar_field():
"""Tests fully (un)packing a scalar field (from)to a GPU buffer."""
fields = _generate_fields()
@@ -53,16 +58,20 @@ def test_full_scalar_field():
pack_eqs = [Assignment(buffer.center(), src_field.center())]
pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
pack_kernel = make_python_function(pack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
pack_ast = create_kernel(pack_eqs, config=config)
pack_kernel = pack_ast.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
unpack_kernel = make_python_function(unpack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
unpack_ast = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_ast.compile()
unpack_kernel(dst_field=gpu_dst_arr, buffer=gpu_buffer_arr)
dst_arr = gpu_dst_arr.get()
@@ -70,7 +79,6 @@ def test_full_scalar_field():
np.testing.assert_equal(src_arr, dst_arr)
@pytest.mark.gpu
def test_field_slice():
"""Tests (un)packing slices of a scalar field (from)to a buffer."""
fields = _generate_fields()
@@ -88,17 +96,21 @@ def test_field_slice():
pack_eqs = [Assignment(buffer.center(), src_field.center())]
pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
pack_kernel = make_python_function(pack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
pack_ast = create_kernel(pack_eqs, config=config)
pack_kernel = pack_ast.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr[pack_slice])
# Unpack into ghost layer of dst_field in N direction
unpack_eqs = [Assignment(dst_field.center(), buffer.center())]
unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
unpack_kernel = make_python_function(unpack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
unpack_ast = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_ast.compile()
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr[unpack_slice])
dst_arr = gpu_dst_arr.get()
@@ -106,7 +118,6 @@ def test_field_slice():
np.testing.assert_equal(src_arr[pack_slice], dst_arr[unpack_slice])
@pytest.mark.gpu
def test_all_cell_values():
"""Tests (un)packing all cell values of the a field (from)to a buffer."""
num_cell_values = 7
@@ -125,8 +136,11 @@ def test_all_cell_values():
pack_eqs.append(eq)
pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
pack_kernel = make_python_function(pack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
unpack_eqs = []
@@ -136,8 +150,10 @@ def test_all_cell_values():
unpack_eqs.append(eq)
unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
unpack_kernel = make_python_function(unpack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
unpack_ast = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_ast.compile()
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
dst_arr = gpu_dst_arr.get()
@@ -145,9 +161,8 @@ def test_all_cell_values():
np.testing.assert_equal(src_arr, dst_arr)
@pytest.mark.gpu
def test_subset_cell_values():
"""Tests (un)packing a subset of cell values of the a field (from)to a buffer."""
"""Tests (un)packing a subset of cell values of a field (from)to a buffer."""
num_cell_values = 7
# Cell indices of the field to be (un)packed (from)to the buffer
cell_indices = [1, 3, 5, 6]
@@ -166,8 +181,9 @@ def test_subset_cell_values():
pack_eqs.append(eq)
pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
pack_kernel = make_python_function(pack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
pack_ast = create_kernel(pack_eqs, config=config)
pack_kernel = pack_ast.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
unpack_eqs = []
@@ -177,8 +193,10 @@ def test_subset_cell_values():
unpack_eqs.append(eq)
unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
unpack_kernel = make_python_function(unpack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
unpack_ast = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_ast.compile()
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
dst_arr = gpu_dst_arr.get()
@@ -187,7 +205,6 @@ def test_subset_cell_values():
np.testing.assert_equal(dst_arr, mask_arr.filled(int(0)))
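# Illustrative sketch, not part of the original test: when only a subset of cell values is
# packed, the buffer index is the position within cell_indices rather than the field index.
# A NumPy analogue of such a pack loop; _demo_subset_pack and its arguments are hypothetical.
def _demo_subset_pack(src, cell_indices):
    # src has shape (*spatial, num_cell_values); the buffer stores len(cell_indices) per cell
    buf = np.empty(src.shape[:-1] + (len(cell_indices),), dtype=src.dtype)
    for buffer_idx, cell_idx in enumerate(cell_indices):
        buf[..., buffer_idx] = src[..., cell_idx]
    return buf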
@pytest.mark.gpu
def test_field_layouts():
num_cell_values = 7
for layout_str in ['numpy', 'fzyx', 'zyxf', 'reverse_numpy']:
@@ -206,8 +223,10 @@ def test_field_layouts():
pack_eqs.append(eq)
pack_types = {'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
pack_code = create_cuda_kernel(pack_eqs, type_info=pack_types)
pack_kernel = make_python_function(pack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=pack_types)
pack_ast = create_kernel(pack_eqs, config=config)
pack_kernel = pack_ast.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
unpack_eqs = []
@@ -217,6 +236,99 @@ def test_field_layouts():
unpack_eqs.append(eq)
unpack_types = {'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype}
unpack_code = create_cuda_kernel(unpack_eqs, type_info=unpack_types)
unpack_kernel = make_python_function(unpack_code)
config = CreateKernelConfig(target=pystencils.Target.GPU, data_type=unpack_types)
unpack_ast = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_ast.compile()
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
def test_buffer_indexing():
src_field, dst_field = fields('pdfs_src(19), pdfs_dst(19) :double[3D]')
mask_field = fields('mask : uint32 [3D]')
buffer = Field.create_generic('buffer', spatial_dimensions=1, field_type=FieldType.BUFFER,
dtype="float64",
index_shape=(19,))
src_field_size = src_field.spatial_shape
mask_field_size = mask_field.spatial_shape
up = Assignment(buffer(0), flag_cond(1, mask_field.center, src_field[0, 1, 0](1)))
iteration_slice = tuple(slice(None, None, 2) for _ in range(3))
config = CreateKernelConfig(target=Target.GPU)
config = replace(config, iteration_slice=iteration_slice, ghost_layers=0)
ast = create_kernel(up, config=config)
parameters = ast.get_parameters()
spatial_shape_symbols = [p.symbol for p in parameters if p.is_field_shape]
# The loop counters as well as the resolved field access should depend on one common spatial shape
if spatial_shape_symbols[0] in mask_field_size:
for s in spatial_shape_symbols:
assert s in mask_field_size
if spatial_shape_symbols[0] in src_field_size:
for s in spatial_shape_symbols:
assert s in src_field_size
assert len(spatial_shape_symbols) <= 3
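# Illustrative sketch, not part of the original test: the flag_cond assignment above packs a
# neighbour value only for cells whose mask flag is set. The NumPy analogue below assumes
# flag_cond(flag, mask, value) yields `value` where the flag is set in the mask and 0
# otherwise; the exact encoding of flag_cond's first argument (bit index vs. bit mask) is
# not modelled, and _demo_masked_pack with its flag_mask parameter is hypothetical.
def _demo_masked_pack(values, mask, flag_mask=2):
    selected = (mask & flag_mask) != 0      # cells whose flag is set in the mask
    return np.where(selected, values, 0)    # packed value or 0 per cell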
@pytest.mark.parametrize('gpu_indexing', ("block", "line"))
def test_iteration_slices(gpu_indexing):
num_cell_values = 19
dt = np.uint64
fields = _generate_fields(dt=dt, stencil_directions=num_cell_values)
for (src_arr, gpu_src_arr, gpu_dst_arr, gpu_buffer_arr) in fields:
src_field = Field.create_from_numpy_array("src_field", gpu_src_arr, index_dimensions=1)
dst_field = Field.create_from_numpy_array("dst_field", gpu_src_arr, index_dimensions=1)
buffer = Field.create_generic("buffer", spatial_dimensions=1, index_dimensions=1,
field_type=FieldType.BUFFER, dtype=src_arr.dtype)
pack_eqs = []
# Since we pack all cell values for all cells, the buffer index is equivalent to the field index
for idx in range(num_cell_values):
eq = Assignment(buffer(idx), src_field(idx))
pack_eqs.append(eq)
dim = src_field.spatial_dimensions
# Pack only the leftmost slice, only every second cell
pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,)
# Fill the entire array with data
src_arr[(slice(None, None, 1),) * dim] = np.arange(num_cell_values)
gpu_src_arr.set(src_arr)
gpu_dst_arr.fill(0)
config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
data_type={'src_field': gpu_src_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
gpu_indexing=gpu_indexing)
pack_code = create_kernel(pack_eqs, config=config)
pack_kernel = pack_code.compile()
pack_kernel(buffer=gpu_buffer_arr, src_field=gpu_src_arr)
unpack_eqs = []
for idx in range(num_cell_values):
eq = Assignment(dst_field(idx), buffer(idx))
unpack_eqs.append(eq)
config = CreateKernelConfig(target=Target.GPU, iteration_slice=pack_slice,
data_type={'dst_field': gpu_dst_arr.dtype, 'buffer': gpu_buffer_arr.dtype},
gpu_indexing=gpu_indexing)
unpack_code = create_kernel(unpack_eqs, config=config)
unpack_kernel = unpack_code.compile()
unpack_kernel(buffer=gpu_buffer_arr, dst_field=gpu_dst_arr)
dst_arr = gpu_dst_arr.get()
src_arr = gpu_src_arr.get()
# Check that only every second entry of the leftmost slice has been copied (see the sketch below)
np.testing.assert_equal(dst_arr[pack_slice], src_arr[pack_slice])
np.testing.assert_equal(dst_arr[(slice(1, None, 2),) * (dim - 1) + (0,)], 0)
np.testing.assert_equal(dst_arr[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)], 0)
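# Illustrative sketch, not part of the original test: the three assertions above split the
# destination array into the copied region (every second cell of the leftmost slice) and its
# complement, which must remain zero. A small NumPy analogue with two spatial axes and two
# cell values; _demo_complement_checks and the 4x3x2 shape are hypothetical.
def _demo_complement_checks():
    dim, num_values = 2, 2
    dst = np.zeros((4, 3, num_values), dtype=np.uint64)
    pack_slice = (slice(None, None, 2),) * (dim - 1) + (0,)
    dst[pack_slice] = 7                                  # pretend the kernel copied here
    # Cells skipped by the stride-2 slice stay zero
    assert np.all(dst[(slice(1, None, 2),) * (dim - 1) + (0,)] == 0)
    # Everything outside the leftmost slice stays zero as well
    assert np.all(dst[(slice(None, None, 1),) * (dim - 1) + (slice(1, None),)] == 0)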