diff --git a/pystencils/backends/cbackend.py b/pystencils/backends/cbackend.py
index 94f3477290ecf79fcd8a095389a752d1505d67e2..aca5e5da1e36774dbd7b69b1a0f0f57012bc4dd6 100644
--- a/pystencils/backends/cbackend.py
+++ b/pystencils/backends/cbackend.py
@@ -298,7 +298,7 @@ class CBackend:
         return node.get_code(self._dialect, self._vector_instruction_set)
 
     def _print_SourceCodeComment(self, node):
-        return "/* " + node.text + " */"
+        return f"/* {node.text } */"
 
     def _print_EmptyLine(self, node):
         return ""
@@ -316,7 +316,7 @@ class CBackend:
         result = f"if ({condition_expr})\n{true_block} "
         if node.false_block:
             false_block = self._print_Block(node.false_block)
-            result += "else " + false_block
+            result += f"else {false_block}"
         return result
 
 
@@ -336,7 +336,7 @@ class CustomSympyPrinter(CCodePrinter):
             return self._typed_number(expr.evalf(), get_type_of_expression(expr))
 
         if expr.exp.is_integer and expr.exp.is_number and 0 < expr.exp < 8:
-            return "(" + self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False)) + ")"
+            return f"({self._print(sp.Mul(*[expr.base] * expr.exp, evaluate=False))})"
         elif expr.exp.is_integer and expr.exp.is_number and - 8 < expr.exp < 0:
             return f"1 / ({self._print(sp.Mul(*([expr.base] * -expr.exp), evaluate=False))})"
         else:
@@ -589,9 +589,6 @@ class VectorizedCustomSympyPrinter(CustomSympyPrinter):
             result = self.instruction_set['&'].format(result, item)
         return result
 
-    def _print_Max(self, expr):
-        return "test"
-
     def _print_Or(self, expr):
         result = self._scalarFallback('_print_Or', expr)
         if result:
diff --git a/pystencils/backends/cuda_backend.py b/pystencils/backends/cuda_backend.py
index b43cfb4ededa3ea5c84ff2eef9d434d602f358b2..2d7dc579e932aee814e5207801971246122a9a88 100644
--- a/pystencils/backends/cuda_backend.py
+++ b/pystencils/backends/cuda_backend.py
@@ -104,7 +104,6 @@ class CudaSympyPrinter(CustomSympyPrinter):
             assert len(expr.args) == 1, f"__fsqrt_rn has one argument, but {len(expr.args)} where given"
             return f"__fsqrt_rn({self._print(expr.args[0])})"
         elif isinstance(expr, fast_inv_sqrt):
-            print(len(expr.args) == 1)
             assert len(expr.args) == 1, f"__frsqrt_rn has one argument, but {len(expr.args)} where given"
             return f"__frsqrt_rn({self._print(expr.args[0])})"
         return super()._print_Function(expr)
diff --git a/pystencils/datahandling/datahandling_interface.py b/pystencils/datahandling/datahandling_interface.py
index 272f1660ce1c2a6623fb1bb069b37b16d68042f3..0eb101815ec39788cb2b1b9e720f2628bf087edf 100644
--- a/pystencils/datahandling/datahandling_interface.py
+++ b/pystencils/datahandling/datahandling_interface.py
@@ -86,6 +86,13 @@ class DataHandling(ABC):
         Args:
             description (str): String description of the fields to add
             dtype: data type of the array as numpy data type
+            ghost_layers: number of ghost layers - if not specified a default value specified in the constructor
+                         is used
+            layout: memory layout of array, either structure of arrays 'SoA' or array of structures 'AoS'.
+                    this is only important if values_per_cell > 1
+            cpu: allocate field on the CPU
+            gpu: allocate field on the GPU, if None, a GPU field is allocated if default_target is 'gpu'
+            alignment: either False for no alignment, or the number of bytes to align to
         Returns:
             Fields representing the just created arrays
         """
@@ -200,6 +207,10 @@ class DataHandling(ABC):
         directly passed to the kernel function and override possible parameters from the DataHandling
         """
 
+    @abstractmethod
+    def get_kernel_kwargs(self, kernel_function, **kwargs):
+        """Returns the input arguments of a kernel"""
+
     @abstractmethod
     def swap(self, name1, name2, gpu=False):
         """Swaps data of two arrays"""
diff --git a/pystencils/datahandling/serial_datahandling.py b/pystencils/datahandling/serial_datahandling.py
index 7352951d22393d58ca6f6bef52522e5760ba71cb..ce4629f6ab7f35c96cec437ab76b778dfec83e04 100644
--- a/pystencils/datahandling/serial_datahandling.py
+++ b/pystencils/datahandling/serial_datahandling.py
@@ -266,10 +266,10 @@ class SerialDataHandling(DataHandling):
         return name in self.gpu_arrays
 
     def synchronization_function_cpu(self, names, stencil_name=None, **_):
-        return self.synchronization_function(names, stencil_name, 'cpu')
+        return self.synchronization_function(names, stencil_name, target='cpu')
 
     def synchronization_function_gpu(self, names, stencil_name=None, **_):
-        return self.synchronization_function(names, stencil_name, 'gpu')
+        return self.synchronization_function(names, stencil_name, target='gpu')
 
     def synchronization_function(self, names, stencil=None, target=None, **_):
         if target is None:
@@ -425,14 +425,15 @@ class SerialDataHandling(DataHandling):
         np.savez_compressed(file, **self.cpu_arrays)
 
     def load_all(self, file):
+        if '.npz' not in file:
+            file += '.npz'
         file_contents = np.load(file)
         for arr_name, arr_contents in self.cpu_arrays.items():
             if arr_name not in file_contents:
                 print(f"Skipping read data {arr_name} because there is no data with this name in data handling")
                 continue
             if file_contents[arr_name].shape != arr_contents.shape:
-                print("Skipping read data {} because shapes don't match. "
-                      "Read array shape {}, existing array shape {}".format(arr_name, file_contents[arr_name].shape,
-                                                                            arr_contents.shape))
+                print(f"Skipping read data {arr_name} because shapes don't match. "
+                      f"Read array shape {file_contents[arr_name].shape}, existing array shape {arr_contents.shape}")
                 continue
             np.copyto(arr_contents, file_contents[arr_name])
diff --git a/pystencils/stencil.py b/pystencils/stencil.py
index 359a0a29492c537957b8e24a952fa7e052ee898f..28d179c50d2716e4aa1530b2409abb77e6b10e66 100644
--- a/pystencils/stencil.py
+++ b/pystencils/stencil.py
@@ -34,6 +34,8 @@ def is_valid(stencil, max_neighborhood=None):
         True
         >>> is_valid([(2, 0), (1, 0)], max_neighborhood=1)
         False
+        >>> is_valid([(2, 0), (1, 0)], max_neighborhood=2)
+        True
     """
     expected_dim = len(stencil[0])
     for d in stencil:
@@ -67,8 +69,11 @@ def have_same_entries(s1, s2):
     Examples:
         >>> stencil1 = [(1, 0), (-1, 0), (0, 1), (0, -1)]
         >>> stencil2 = [(-1, 0), (0, -1), (1, 0), (0, 1)]
+        >>> stencil3 = [(-1, 0), (0, -1), (1, 0)]
         >>> have_same_entries(stencil1, stencil2)
         True
+        >>> have_same_entries(stencil1, stencil3)
+        False
     """
     if len(s1) != len(s2):
         return False
diff --git a/pystencils/sympyextensions.py b/pystencils/sympyextensions.py
index 55fe74967f6deedeac45233dc198f562d072b485..31e42224ab19f14895e99d7ba9685b5c33a2ef54 100644
--- a/pystencils/sympyextensions.py
+++ b/pystencils/sympyextensions.py
@@ -481,7 +481,7 @@ def count_operations(term: Union[sp.Expr, List[sp.Expr]],
             pass
         elif t.func is sp.Mul:
             if check_type(t):
-                result['muls'] += len(t.args)
+                result['muls'] += len(t.args) - 1
                 for a in t.args:
                     if a == 1 or a == -1:
                         result['muls'] -= 1
@@ -509,7 +509,8 @@ def count_operations(term: Union[sp.Expr, List[sp.Expr]],
                     if t.exp >= 0:
                         result['muls'] += int(t.exp) - 1
                     else:
-                        result['muls'] -= 1
+                        if result['muls'] > 0:
+                            result['muls'] -= 1
                         result['divs'] += 1
                         result['muls'] += (-int(t.exp)) - 1
                 elif sp.nsimplify(t.exp) == sp.Rational(1, 2):
diff --git a/pystencils/transformations.py b/pystencils/transformations.py
index b3f9431bbf3035aaabd25f6eb430c738dddaf3a7..5e306f2de168994575562a92156408f128ab447c 100644
--- a/pystencils/transformations.py
+++ b/pystencils/transformations.py
@@ -1206,13 +1206,13 @@ def get_loop_hierarchy(ast_node):
     return reversed(result)
 
 
-def get_loop_counter_symbol_hierarchy(astNode):
+def get_loop_counter_symbol_hierarchy(ast_node):
     """Determines the loop counter symbols around a given AST node.
-    :param astNode: the AST node
+    :param ast_node: the AST node
     :return: list of loop counter symbols, where the first list entry is the symbol of the innermost loop
     """
     result = []
-    node = astNode
+    node = ast_node
     while node is not None:
         node = get_next_parent_of_type(node, ast.LoopOverCoordinate)
         if node:
diff --git a/pystencils_tests/test_astnodes.py b/pystencils_tests/test_astnodes.py
new file mode 100644
index 0000000000000000000000000000000000000000..98c755efd4d5b57e0b33f47ffe5d7fb2e11df573
--- /dev/null
+++ b/pystencils_tests/test_astnodes.py
@@ -0,0 +1,100 @@
+import sympy as sp
+import pystencils as ps
+
+from pystencils import Assignment
+from pystencils.astnodes import Block, SkipIteration, LoopOverCoordinate, SympyAssignment
+from sympy.codegen.rewriting import optims_c99
+
+dst = ps.fields('dst(8): double[2D]')
+s = sp.symbols('s_:8')
+x = sp.symbols('x')
+y = sp.symbols('y')
+
+
+def test_kernel_function():
+    assignments = [
+        Assignment(dst[0, 0](0), s[0]),
+        Assignment(x, dst[0, 0](2))
+    ]
+
+    ast_node = ps.create_kernel(assignments)
+
+    assert ast_node.target == 'cpu'
+    assert ast_node.backend == 'c'
+    # symbols_defined and undefined_symbols will always return an emtpy set
+    assert ast_node.symbols_defined == set()
+    assert ast_node.undefined_symbols == set()
+    assert ast_node.fields_written == {dst}
+    assert ast_node.fields_read == {dst}
+
+
+def test_skip_iteration():
+    # skip iteration is an object which should give back empty data structures.
+    skipped = SkipIteration()
+    assert skipped.args == []
+    assert skipped.symbols_defined == set()
+    assert skipped.undefined_symbols == set()
+
+
+def test_block():
+    assignments = [
+        Assignment(dst[0, 0](0), s[0]),
+        Assignment(x, dst[0, 0](2))
+    ]
+    bl = Block(assignments)
+    assert bl.symbols_defined == {dst[0, 0](0), dst[0, 0](2), s[0], x}
+
+    bl.append([Assignment(y, 10)])
+    assert bl.symbols_defined == {dst[0, 0](0), dst[0, 0](2), s[0], x, y}
+    assert len(bl.args) == 3
+
+    list_iterator = iter([Assignment(s[1], 11)])
+    bl.insert_front(list_iterator)
+
+    assert bl.args[0] == Assignment(s[1], 11)
+
+
+def test_loop_over_coordinate():
+    assignments = [
+        Assignment(dst[0, 0](0), s[0]),
+        Assignment(x, dst[0, 0](2))
+    ]
+
+    body = Block(assignments)
+    loop = LoopOverCoordinate(body, coordinate_to_loop_over=0, start=0, stop=10, step=1)
+
+    assert loop.body == body
+
+    new_body = Block([assignments[0]])
+    loop = loop.new_loop_with_different_body(new_body)
+    assert loop.body == new_body
+
+    assert loop.start == 0
+    assert loop.stop == 10
+    assert loop.step == 1
+
+    loop.replace(loop.start, 2)
+    loop.replace(loop.stop, 20)
+    loop.replace(loop.step, 2)
+
+    assert loop.start == 2
+    assert loop.stop == 20
+    assert loop.step == 2
+
+
+def test_sympy_assignment():
+
+    assignment = SympyAssignment(dst[0, 0](0), sp.log(x + 3) / sp.log(2) + sp.log(x ** 2 + 1))
+    assignment.optimize(optims_c99)
+
+    ast = ps.create_kernel([assignment])
+    code = ps.get_code_str(ast)
+
+    assert 'log1p' in code
+    assert 'log2' in code
+
+    assignment.replace(assignment.lhs, dst[0, 0](1))
+    assignment.replace(assignment.rhs, sp.log(2))
+
+    assert assignment.lhs == dst[0, 0](1)
+    assert assignment.rhs == sp.log(2)
diff --git a/pystencils_tests/test_data/datahandling_load_test.npz b/pystencils_tests/test_data/datahandling_load_test.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d363a8a0aba1bb78a06314a19b887eb4c4975334
Binary files /dev/null and b/pystencils_tests/test_data/datahandling_load_test.npz differ
diff --git a/pystencils_tests/test_data/datahandling_save_test.npz b/pystencils_tests/test_data/datahandling_save_test.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d363a8a0aba1bb78a06314a19b887eb4c4975334
Binary files /dev/null and b/pystencils_tests/test_data/datahandling_save_test.npz differ
diff --git a/pystencils_tests/test_datahandling.py b/pystencils_tests/test_datahandling.py
index 7f95fe1f04d1bbf72770d2837bf6723681c1f50d..949f067896d3374a7fcb9ecb1369fe5a37eb1f9a 100644
--- a/pystencils_tests/test_datahandling.py
+++ b/pystencils_tests/test_datahandling.py
@@ -1,5 +1,6 @@
 import os
 from tempfile import TemporaryDirectory
+from pathlib import Path
 
 import numpy as np
 
@@ -12,6 +13,9 @@ except ImportError:
     import unittest.mock
     pytest = unittest.mock.MagicMock()
 
+SCRIPT_FOLDER = Path(__file__).parent.absolute()
+INPUT_FOLDER = SCRIPT_FOLDER / "test_data"
+
 
 def basic_iteration(dh):
     dh.add_array('basic_iter_test_gl_default')
@@ -111,6 +115,11 @@ def kernel_execution_jacobi(dh, target):
     test_gpu = target == 'gpu' or target == 'opencl'
     dh.add_array('f', gpu=test_gpu)
     dh.add_array('tmp', gpu=test_gpu)
+
+    if test_gpu:
+        assert dh.is_on_gpu('f')
+        assert dh.is_on_gpu('tmp')
+
     stencil_2d = [(1, 0), (-1, 0), (0, 1), (0, -1)]
     stencil_3d = [(1, 0, 0), (-1, 0, 0), (0, 1, 0), (0, -1, 0), (0, 0, 1), (0, 0, -1)]
     stencil = stencil_2d if dh.dim == 2 else stencil_3d
@@ -197,6 +206,7 @@ def test_access_and_gather():
 def test_kernel():
     for domain_shape in [(4, 5), (3, 4, 5)]:
         dh = create_data_handling(domain_size=domain_shape, periodicity=True)
+        assert all(dh.periodicity)
         kernel_execution_jacobi(dh, 'cpu')
         reduction(dh)
 
@@ -243,3 +253,105 @@ def test_add_arrays():
     assert y_ == y
     assert x == dh.fields['x']
     assert y == dh.fields['y']
+
+
+def test_get_kwarg():
+    domain_shape = (10, 10)
+    field_description = 'src, dst'
+
+    dh = create_data_handling(domain_size=domain_shape, default_ghost_layers=1)
+    src, dst = dh.add_arrays(field_description)
+    dh.fill("src", 1.0, ghost_layers=True)
+    dh.fill("dst", 0.0, ghost_layers=True)
+
+    with pytest.raises(ValueError):
+        dh.add_array('src')
+
+    ur = ps.Assignment(src.center, dst.center)
+    kernel = ps.create_kernel(ur).compile()
+
+    kw = dh.get_kernel_kwargs(kernel)
+    assert np.all(kw[0]['src'] == dh.cpu_arrays['src'])
+    assert np.all(kw[0]['dst'] == dh.cpu_arrays['dst'])
+
+
+def test_add_custom_data():
+    pytest.importorskip('pycuda')
+
+    import pycuda.gpuarray as gpuarray
+    import pycuda.autoinit  # noqa
+
+    def cpu_data_create_func():
+        return np.ones((2, 2), dtype=np.float64)
+
+    def gpu_data_create_func():
+        return gpuarray.zeros((2, 2), dtype=np.float64)
+
+    def cpu_to_gpu_transfer_func(gpuarr, cpuarray):
+        gpuarr.set(cpuarray)
+
+    def gpu_to_cpu_transfer_func(gpuarr, cpuarray):
+        gpuarr.get(cpuarray)
+
+    dh = create_data_handling(domain_size=(10, 10))
+    dh.add_custom_data('custom_data',
+                       cpu_data_create_func,
+                       gpu_data_create_func,
+                       cpu_to_gpu_transfer_func,
+                       gpu_to_cpu_transfer_func)
+
+    assert np.all(dh.custom_data_cpu['custom_data'] == 1)
+    assert np.all(dh.custom_data_gpu['custom_data'].get() == 0)
+
+    dh.to_cpu(name='custom_data')
+    dh.to_gpu(name='custom_data')
+
+    assert 'custom_data' in dh.custom_data_names
+
+
+def test_log():
+    dh = create_data_handling(domain_size=(10, 10))
+    dh.log_on_root()
+    assert dh.is_root
+    assert dh.world_rank == 0
+
+
+def test_save_data():
+    domain_shape = (2, 2)
+
+    dh = create_data_handling(domain_size=domain_shape, default_ghost_layers=1)
+    dh.add_array("src", values_per_cell=9)
+    dh.fill("src", 1.0, ghost_layers=True)
+    dh.add_array("dst", values_per_cell=9)
+    dh.fill("dst", 1.0, ghost_layers=True)
+
+    dh.save_all(str(INPUT_FOLDER) + '/datahandling_save_test')
+
+
+def test_load_data():
+    domain_shape = (2, 2)
+
+    dh = create_data_handling(domain_size=domain_shape, default_ghost_layers=1)
+    dh.add_array("src", values_per_cell=9)
+    dh.fill("src", 0.0, ghost_layers=True)
+    dh.add_array("dst", values_per_cell=9)
+    dh.fill("dst", 0.0, ghost_layers=True)
+
+    dh.load_all(str(INPUT_FOLDER) + '/datahandling_load_test')
+    assert np.all(dh.cpu_arrays['src']) == 1
+    assert np.all(dh.cpu_arrays['dst']) == 1
+
+    domain_shape = (3, 3)
+
+    dh = create_data_handling(domain_size=domain_shape, default_ghost_layers=1)
+    dh.add_array("src", values_per_cell=9)
+    dh.fill("src", 0.0, ghost_layers=True)
+    dh.add_array("dst", values_per_cell=9)
+    dh.fill("dst", 0.0, ghost_layers=True)
+    dh.add_array("dst2", values_per_cell=9)
+    dh.fill("dst2", 0.0, ghost_layers=True)
+
+    dh.load_all(str(INPUT_FOLDER) + '/datahandling_load_test')
+    assert np.all(dh.cpu_arrays['src']) == 0
+    assert np.all(dh.cpu_arrays['dst']) == 0
+    assert np.all(dh.cpu_arrays['dst2']) == 0
diff --git a/pystencils_tests/test_datahandling_parallel.py b/pystencils_tests/test_datahandling_parallel.py
index 0bfbfbd02fc86952e7fba84ce6aa080d2080ec0a..efe106dc0090bf69dff0d187008c1a8653c4a98e 100644
--- a/pystencils_tests/test_datahandling_parallel.py
+++ b/pystencils_tests/test_datahandling_parallel.py
@@ -1,10 +1,17 @@
 import numpy as np
 import waLBerla as wlb
+from pystencils import make_slice
 
 from pystencils.datahandling.parallel_datahandling import ParallelDataHandling
 from pystencils_tests.test_datahandling import (
     access_and_gather, kernel_execution_jacobi, reduction, synchronization, vtk_output)
 
+try:
+    import pytest
+except ImportError:
+    import unittest.mock
+    pytest = unittest.mock.MagicMock()
+
 
 def test_access_and_gather():
     block_size = (4, 7, 1)
@@ -64,3 +71,51 @@ def test_vtk_output():
     blocks = wlb.createUniformBlockGrid(blocks=(3, 2, 4), cellsPerBlock=(3, 2, 5), oneBlockPerProcess=False)
     dh = ParallelDataHandling(blocks)
     vtk_output(dh)
+
+
+def test_block_iteration():
+    block_size = (16, 16, 16)
+    num_blocks = (2, 2, 2)
+    blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
+    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
+    dh.add_array('v', values_per_cell=1, dtype=np.int64, ghost_layers=2, gpu=True)
+
+    for b in dh.iterate():
+        b['v'].fill(1)
+
+    s = 0
+    for b in dh.iterate():
+        s += np.sum(b['v'])
+
+    assert s == 40*40*40
+
+    sl = make_slice[0:18, 0:18, 0:18]
+    for b in dh.iterate(slice_obj=sl):
+        b['v'].fill(0)
+
+    s = 0
+    for b in dh.iterate():
+        s += np.sum(b['v'])
+
+    assert s == 40*40*40 - 20*20*20
+
+
+def test_getter_setter():
+    block_size = (2, 2, 2)
+    num_blocks = (2, 2, 2)
+    blocks = wlb.createUniformBlockGrid(blocks=num_blocks, cellsPerBlock=block_size, oneBlockPerProcess=False)
+    dh = ParallelDataHandling(blocks, default_ghost_layers=2)
+    dh.add_array('v', values_per_cell=1, dtype=np.int64, ghost_layers=2, gpu=True)
+
+    assert dh.shape == (4, 4, 4)
+    assert dh.periodicity == (False, False, False)
+    assert dh.values_per_cell('v') == 1
+    assert dh.has_data('v') is True
+    assert 'v' in dh.array_names
+    dh.log_on_root()
+    assert dh.is_root is True
+    assert dh.world_rank == 0
+
+    dh.to_gpu('v')
+    assert dh.is_on_gpu('v') is True
+    dh.all_to_cpu()
diff --git a/pystencils_tests/test_fast_approximation.py b/pystencils_tests/test_fast_approximation.py
index f4d19fa19fa615aaa76a7b1bb445fa7bdb886237..ccd2d7b8ea15a4c36069a58bc1197b314cce2ca1 100644
--- a/pystencils_tests/test_fast_approximation.py
+++ b/pystencils_tests/test_fast_approximation.py
@@ -11,9 +11,9 @@ def test_fast_sqrt():
 
     assert len(insert_fast_sqrts(expr).atoms(fast_sqrt)) == 1
     assert len(insert_fast_sqrts([expr])[0].atoms(fast_sqrt)) == 1
-    ast = ps.create_kernel(ps.Assignment(g[0, 0], insert_fast_sqrts(expr)), target='gpu')
-    ast.compile()
-    code_str = ps.get_code_str(ast)
+    ast_gpu = ps.create_kernel(ps.Assignment(g[0, 0], insert_fast_sqrts(expr)), target='gpu')
+    ast_gpu.compile()
+    code_str = ps.get_code_str(ast_gpu)
     assert '__fsqrt_rn' in code_str
 
     expr = ps.Assignment(sp.Symbol("tmp"), 3 / sp.sqrt(f[0, 0] + f[1, 0]))
@@ -21,9 +21,9 @@ def test_fast_sqrt():
 
     ac = ps.AssignmentCollection([expr], [])
     assert len(insert_fast_sqrts(ac).main_assignments[0].atoms(fast_inv_sqrt)) == 1
-    ast = ps.create_kernel(insert_fast_sqrts(ac), target='gpu')
-    ast.compile()
-    code_str = ps.get_code_str(ast)
+    ast_gpu = ps.create_kernel(insert_fast_sqrts(ac), target='gpu')
+    ast_gpu.compile()
+    code_str = ps.get_code_str(ast_gpu)
     assert '__frsqrt_rn' in code_str
 
 
diff --git a/pystencils_tests/test_kerncraft_coupling.py b/pystencils_tests/test_kerncraft_coupling.py
index 653ed34d90e6ecd45a7a5785fb71d522cc3734f5..aeb4b7acb6f62730f0896b10d6b8fa6d655c871d 100644
--- a/pystencils_tests/test_kerncraft_coupling.py
+++ b/pystencils_tests/test_kerncraft_coupling.py
@@ -7,7 +7,7 @@ from kerncraft.kernel import KernelCode
 from kerncraft.machinemodel import MachineModel
 from kerncraft.models import ECM, ECMData, Benchmark
 
-from pystencils import Assignment, Field
+from pystencils import Assignment, Field, fields
 from pystencils.cpu import create_kernel
 from pystencils.kerncraft_coupling import KerncraftParameters, PyStencilsKerncraftKernel
 from pystencils.kerncraft_coupling.generate_benchmark import generate_benchmark, run_c_benchmark
@@ -159,3 +159,15 @@ def test_benchmark():
     timeloop_time = timeloop.benchmark(number_of_time_steps_for_estimation=1)
 
     np.testing.assert_almost_equal(c_benchmark_run, timeloop_time, decimal=4)
+
+
+@pytest.mark.kerncraft
+def test_kerncraft_generic_field():
+    a = fields('a: double[3D]')
+    b = fields('b: double[3D]')
+    s = sp.Symbol("s")
+    rhs = a[0, -1, 0] + a[0, 1, 0] + a[-1, 0, 0] + a[1, 0, 0] + a[0, 0, -1] + a[0, 0, 1]
+
+    update_rule = Assignment(b[0, 0, 0], s * rhs)
+    ast = create_kernel([update_rule])
+    k = PyStencilsKerncraftKernel(ast, debug_print=True)
diff --git a/pystencils_tests/test_simplification_strategy.py b/pystencils_tests/test_simplification_strategy.py
index 189482c006197160e1f49771dc534206f8d0ef9e..5176ae5f49aa45ed952882cb0313d5e5f7754177 100644
--- a/pystencils_tests/test_simplification_strategy.py
+++ b/pystencils_tests/test_simplification_strategy.py
@@ -1,5 +1,6 @@
 import sympy as sp
 
+import pystencils as ps
 from pystencils import Assignment, AssignmentCollection
 from pystencils.simp import (
     SimplificationStrategy, apply_on_all_subexpressions,
@@ -43,3 +44,39 @@ def test_simplification_strategy():
     assert 'Adds' in report._repr_html_()
 
     assert 'factor' in str(strategy)
+
+
+def test_split_inner_loop():
+    dst = ps.fields('dst(8): double[2D]')
+    s = sp.symbols('s_:8')
+    x = sp.symbols('x')
+    subexpressions = []
+    main = [
+        Assignment(dst[0, 0](0), s[0]),
+        Assignment(dst[0, 0](1), s[1]),
+        Assignment(dst[0, 0](2), s[2]),
+        Assignment(dst[0, 0](3), s[3]),
+        Assignment(dst[0, 0](4), s[4]),
+        Assignment(dst[0, 0](5), s[5]),
+        Assignment(dst[0, 0](6), s[6]),
+        Assignment(dst[0, 0](7), s[7]),
+        Assignment(x, sum(s))
+    ]
+    ac = AssignmentCollection(main, subexpressions)
+    split_groups = [[dst[0, 0](0), dst[0, 0](1)],
+                    [dst[0, 0](2), dst[0, 0](3)],
+                    [dst[0, 0](4), dst[0, 0](5)],
+                    [dst[0, 0](6), dst[0, 0](7), x]]
+    ac.simplification_hints['split_groups'] = split_groups
+    ast = ps.create_kernel(ac)
+
+    code = ps.get_code_str(ast)
+    # we have four inner loops as indicated in split groups (4 elements) plus one outer loop
+    assert code.count('for') == 5
+
+    ac = AssignmentCollection(main, subexpressions)
+    ast = ps.create_kernel(ac)
+
+    code = ps.get_code_str(ast)
+    # one inner loop and one outer loop
+    assert code.count('for') == 2
diff --git a/pystencils_tests/test_slicing.py b/pystencils_tests/test_slicing.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e36576bfb622cae3f4f9ed865a8b2f8308430b
--- /dev/null
+++ b/pystencils_tests/test_slicing.py
@@ -0,0 +1,73 @@
+import numpy as np
+from pystencils import create_data_handling
+from pystencils.slicing import SlicedGetter, make_slice, SlicedGetterDataHandling, shift_slice, slice_intersection
+
+
+def test_sliced_getter():
+    def get_slice(slice_obj=None):
+        arr = np.ones((10, 10))
+        if slice_obj is None:
+            slice_obj = make_slice[:, :]
+
+        return arr[slice_obj]
+
+    sli = SlicedGetter(get_slice)
+
+    test = make_slice[2:-2, 2:-2]
+    assert sli[test].shape == (6, 6)
+
+
+def test_sliced_getter_data_handling():
+    domain_shape = (10, 10)
+
+    dh = create_data_handling(domain_size=domain_shape, default_ghost_layers=1)
+    dh.add_array("src", values_per_cell=1)
+    dh.fill("src", 1.0, ghost_layers=True)
+
+    dh.add_array("dst", values_per_cell=1)
+    dh.fill("dst", 0.0, ghost_layers=True)
+
+    sli = SlicedGetterDataHandling(dh, 'dst')
+    slice_obj = make_slice[2:-2, 2:-2]
+    assert np.sum(sli[slice_obj]) == 0
+
+    sli = SlicedGetterDataHandling(dh, 'src')
+    slice_obj = make_slice[2:-2, 2:-2]
+    assert np.sum(sli[slice_obj]) == 36
+
+
+def test_shift_slice():
+
+    sh = shift_slice(make_slice[2:-2, 2:-2], [1, 2])
+    assert sh[0] == slice(3, -1, None)
+    assert sh[1] == slice(4, 0, None)
+
+    sh = shift_slice(make_slice[2:-2, 2:-2], 1)
+    assert sh[0] == slice(3, -1, None)
+    assert sh[1] == slice(3, -1, None)
+
+    sh = shift_slice([2, 4], 1)
+    assert sh[0] == 3
+    assert sh[1] == 5
+
+    sh = shift_slice([2, None], 1)
+    assert sh[0] == 3
+    assert sh[1] is None
+
+    sh = shift_slice([1.5, 1.5], 1)
+    assert sh[0] == 1.5
+    assert sh[1] == 1.5
+
+
+def test_slice_intersection():
+    sl1 = make_slice[1:10, 1:10]
+    sl2 = make_slice[5:15, 5:15]
+
+    intersection = slice_intersection(sl1, sl2)
+    assert intersection[0] == slice(5, 10, None)
+    assert intersection[1] == slice(5, 10, None)
+
+    sl2 = make_slice[12:15, 12:15]
+
+    intersection = slice_intersection(sl1, sl2)
+    assert intersection is None
diff --git a/pystencils_tests/test_staggered_kernel.py b/pystencils_tests/test_staggered_kernel.py
index c4f491393d2b4d08fbb9b4c99356abdc7e7199b6..eecbf706c5538ad544d29a6b37366944aa504680 100644
--- a/pystencils_tests/test_staggered_kernel.py
+++ b/pystencils_tests/test_staggered_kernel.py
@@ -4,6 +4,7 @@ import sympy as sp
 import pytest
 
 import pystencils as ps
+from pystencils import x_staggered_vector, TypedSymbol
 
 
 class TestStaggeredDiffusion:
@@ -96,3 +97,12 @@ def test_staggered_loop_cutting():
     assignments = [ps.Assignment(j.staggered_access("SW"), 1)]
     ast = ps.create_staggered_kernel(assignments, target=dh.default_target)
     assert not ast.atoms(ps.astnodes.Conditional)
+
+
+def test_staggered_vector():
+    dim = 2
+    v = x_staggered_vector(dim)
+    ctr0 = TypedSymbol('ctr_0', 'int', nonnegative=True)
+    ctr1 = TypedSymbol('ctr_1', 'int', nonnegative=True)
+    expected_result = sp.Matrix(tuple((ctr0 + 0.5, ctr1 + 0.5)))
+    assert v == expected_result
\ No newline at end of file
diff --git a/pystencils_tests/test_stencils.py b/pystencils_tests/test_stencils.py
index bf1418e053d41ebeb80b6ae623f1834b24a8a713..fa205ad7f04a446e1e16d5ea95e679cea70ecde4 100644
--- a/pystencils_tests/test_stencils.py
+++ b/pystencils_tests/test_stencils.py
@@ -1 +1,30 @@
 import pystencils as ps
+import sympy as sp
+
+from pystencils.stencil import coefficient_list, plot_expression
+
+
+def test_coefficient_list():
+    f = ps.fields("f: double[1D]")
+    expr = 2 * f[1] + 3 * f[-1]
+    coff = coefficient_list(expr)
+    assert coff == [3, 0, 2]
+    plot_expression(expr, matrix_form=True)
+
+    f = ps.fields("f: double[3D]")
+    expr = 2 * f[1, 0, 0] + 3 * f[0, -1, 0]
+    coff = coefficient_list(expr)
+    assert coff == [[[0, 3, 0], [0, 0, 2], [0, 0, 0]]]
+
+    expr = 2 * f[1, 0, 0] + 3 * f[0, -1, 0] + 4 * f[0, 0, 1]
+    coff = coefficient_list(expr, matrix_form=True)
+    assert coff[0] == sp.zeros(3, 3)
+
+    # in 3D plot only works if there are entries on every of the three 2D planes. In the above examples z-1 was empty
+    expr = 2 * f[1, 0, 0] + 1 * f[0, -1, 0] + 1 * f[0, 0, 1] + f[0, 0, -1]
+    plot_expression(expr)
+
+
+def test_plot_expression():
+    f = ps.fields("f: double[2D]")
+    plot_expression(2 * f[1, 0] + 3 * f[0, -1], matrix_form=True)
\ No newline at end of file
diff --git a/pystencils_tests/test_sympyextensions.py b/pystencils_tests/test_sympyextensions.py
index 1636df632670a4a909321999067389bdaaa56a62..2135ee88e7ee5d9375ce5213a2bdb953bea3d5b8 100644
--- a/pystencils_tests/test_sympyextensions.py
+++ b/pystencils_tests/test_sympyextensions.py
@@ -119,7 +119,7 @@ def test_count_operations():
     expr = sympy.Pow(1/x + y * sympy.sqrt(z), 100)
     ops = count_operations(expr, only_type=None)
     assert ops['adds'] == 1
-    assert ops['muls'] == 100
+    assert ops['muls'] == 99
     assert ops['divs'] == 1
     assert ops['sqrts'] == 1
 
diff --git a/pystencils/test_type_interference.py b/pystencils_tests/test_type_interference.py
similarity index 100%
rename from pystencils/test_type_interference.py
rename to pystencils_tests/test_type_interference.py
diff --git a/pystencils_tests/test_vectorization.py b/pystencils_tests/test_vectorization.py
index 1fa1812eb8fced19d114ef3cb32900ec9b93f995..3cc1be4f36b126baf19d074657b4394cdd2cefbb 100644
--- a/pystencils_tests/test_vectorization.py
+++ b/pystencils_tests/test_vectorization.py
@@ -4,6 +4,7 @@ import sympy as sp
 import pystencils as ps
 from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
 from pystencils.cpu.vectorization import vectorize
+from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 from pystencils.transformations import replace_inner_stride_with_one
 
 
@@ -109,7 +110,6 @@ def test_piecewise1():
 
 
 def test_piecewise2():
-
     arr = np.zeros((20, 20))
 
     @ps.kernel
@@ -128,7 +128,6 @@ def test_piecewise2():
 
 
 def test_piecewise3():
-
     arr = np.zeros((22, 22))
 
     @ps.kernel
@@ -146,12 +145,32 @@ def test_logical_operators():
     arr = np.zeros((22, 22))
 
     @ps.kernel
-    def test_kernel(s):
+    def kernel_and(s):
         f, g = ps.fields(f=arr, g=arr)
         s.c @= sp.And(f[0, 1] < 0.0, f[1, 0] < 0.0)
         g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])
 
-    ast = ps.create_kernel(test_kernel)
+    ast = ps.create_kernel(kernel_and)
+    vectorize(ast)
+    ast.compile()
+
+    @ps.kernel
+    def kernel_or(s):
+        f, g = ps.fields(f=arr, g=arr)
+        s.c @= sp.Or(f[0, 1] < 0.0, f[1, 0] < 0.0)
+        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])
+
+    ast = ps.create_kernel(kernel_or)
+    vectorize(ast)
+    ast.compile()
+
+    @ps.kernel
+    def kernel_equal(s):
+        f, g = ps.fields(f=arr, g=arr)
+        s.c @= sp.Eq(f[0, 1], 2.0)
+        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])
+
+    ast = ps.create_kernel(kernel_equal)
     vectorize(ast)
     ast.compile()
 
@@ -159,3 +178,61 @@ def test_logical_operators():
 def test_hardware_query():
     instruction_sets = get_supported_instruction_sets()
     assert 'sse' in instruction_sets
+
+
+def test_vectorised_pow():
+    arr = np.zeros((24, 24))
+    f, g = ps.fields(f=arr, g=arr)
+
+    as1 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], 2))
+    as2 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], 0.5))
+    as3 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], -0.5))
+    as4 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], 4))
+    as5 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], -4))
+    as6 = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], -1))
+
+    ast = ps.create_kernel(as1)
+    vectorize(ast)
+    ast.compile()
+
+    ast = ps.create_kernel(as2)
+    vectorize(ast)
+    ast.compile()
+
+    ast = ps.create_kernel(as3)
+    vectorize(ast)
+    ast.compile()
+
+    ast = ps.create_kernel(as4)
+    vectorize(ast)
+    ast.compile()
+
+    ast = ps.create_kernel(as5)
+    vectorize(ast)
+    ast.compile()
+
+    ast = ps.create_kernel(as6)
+    vectorize(ast)
+    ast.compile()
+
+
+def test_vectorised_fast_approximations():
+    arr = np.zeros((24, 24))
+    f, g = ps.fields(f=arr, g=arr)
+
+    expr = sp.sqrt(f[0, 0] + f[1, 0])
+    assignment = ps.Assignment(g[0, 0], insert_fast_sqrts(expr))
+    ast = ps.create_kernel(assignment)
+    vectorize(ast)
+    ast.compile()
+
+    expr = f[0, 0] / f[1, 0]
+    assignment = ps.Assignment(g[0, 0], insert_fast_divisions(expr))
+    ast = ps.create_kernel(assignment)
+    vectorize(ast)
+    ast.compile()
+
+    assignment = ps.Assignment(sp.Symbol("tmp"), 3 / sp.sqrt(f[0, 0] + f[1, 0]))
+    ast = ps.create_kernel(insert_fast_sqrts(assignment))
+    vectorize(ast)
+    ast.compile()