diff --git a/boundaries/boundaryhandling.py b/boundaries/boundaryhandling.py
index b38d86656bff6cfe653a189707cea9f531dc1633..0c72ab34b89065fe3ff36694a0c1e37212715d7a 100644
--- a/boundaries/boundaryhandling.py
+++ b/boundaries/boundaryhandling.py
@@ -211,6 +211,23 @@ class BoundaryHandling:
 
                 self._boundary_object_to_boundary_info[b_obj].kernel(**kwargs)
 
+    def add_fixed_steps(self, fixed_loop, **kwargs):
+        if self._dirty:
+            self.prepare()
+
+        for b in self._data_handling.iterate(gpu=self._target == 'gpu'):
+            for b_obj, idx_arr in b[self._index_array_name].boundary_object_to_index_list.items():
+                arguments = kwargs.copy()
+                arguments[self._field_name] = b[self._field_name]
+                arguments['indexField'] = idx_arr
+                data_used_in_kernel = (p.field_name
+                                       for p in self._boundary_object_to_boundary_info[b_obj].kernel.parameters
+                                       if p.is_field_ptr_argument and p.field_name not in arguments)
+                arguments.update({name: b[name] for name in data_used_in_kernel if name not in arguments})
+
+                kernel = self._boundary_object_to_boundary_info[b_obj].kernel
+                fixed_loop.add_call(kernel, arguments)
+
     def geometry_to_vtk(self, file_name='geometry', boundaries='all', ghost_layers=False):
         """
         Writes a VTK field where each cell with the given boundary is marked with 1, other cells are 0
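
A minimal sketch of how `add_fixed_steps` is meant to be used together with the reworked `TimeLoop` further down in this patch; `dh`, `boundary_handling` and `lb_kernel` are placeholders for objects created elsewhere, and the import path `pystencils.timeloop` is an assumption:

```python
from pystencils.timeloop import TimeLoop  # assumed import path

# Sketch only: dh, boundary_handling and lb_kernel come from a surrounding setup,
# e.g. a data handling, a BoundaryHandling instance and a compiled kernel.
time_loop = TimeLoop(steps=1)

# Queue the compiled bulk kernel once per block ...
time_loop.add_call(lb_kernel, dh.get_kernel_kwargs(lb_kernel))

# ... and let the boundary handling queue one call per block and boundary object.
boundary_handling.add_fixed_steps(time_loop)

time_loop.run(100)
```
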
diff --git a/cpu/cpujit.py b/cpu/cpujit.py
index 62ed51cda449a8ac799d08524f6f444113b14da9..c5fddc2c324aca946d0f41f183caa9d80cb03769 100644
--- a/cpu/cpujit.py
+++ b/cpu/cpujit.py
@@ -50,7 +50,6 @@ import platform
 import shutil
 import textwrap
 import numpy as np
-import functools
 import subprocess
 from appdirs import user_config_dir, user_cache_dir
 from collections import OrderedDict
diff --git a/datahandling/parallel_datahandling.py b/datahandling/parallel_datahandling.py
index a5d88c6d943166824becd5e733528759b6c9bffb..0c85e4a77815ef58eca4fa5d85ce37a0043fd2dd 100644
--- a/datahandling/parallel_datahandling.py
+++ b/datahandling/parallel_datahandling.py
@@ -1,11 +1,11 @@
 import numpy as np
+import warnings
 from pystencils import Field
 from pystencils.datahandling.datahandling_interface import DataHandling
 from pystencils.datahandling.blockiteration import sliced_block_iteration, block_iteration
 from pystencils.utils import DotDict
 # noinspection PyPep8Naming
 import waLBerla as wlb
-import warnings
 
 
 class ParallelDataHandling(DataHandling):
@@ -16,14 +16,15 @@ class ParallelDataHandling(DataHandling):
         """
         Creates data handling based on walberla block storage
 
-        :param blocks: walberla block storage
-        :param default_ghost_layers: nr of ghost layers used if not specified in add() method
-        :param default_layout: layout used if no layout is given to add() method
-        :param dim: dimension of scenario,
-                    walberla always uses three dimensions, so if dim=2 the extend of the
-                    z coordinate of blocks has to be 1
-        :param default_target: either 'cpu' or 'gpu' . If set to 'gpu' for each array also a GPU version is allocated
-                              if not overwritten in add_array, and synchronization functions are for the GPU by default
+        Args:
+            blocks: walberla block storage
+            default_ghost_layers: number of ghost layers used if not specified in add() method
+            default_layout: layout used if no layout is given to add() method
+            dim: dimension of the scenario;
+                 walberla always uses three dimensions, so if dim=2 the extent of the blocks
+                 in the z coordinate has to be 1
+            default_target: either 'cpu' or 'gpu'. If set to 'gpu', a GPU version is allocated for each array
+                            unless overridden in add_array, and synchronization functions target the GPU by default
         """
         super(ParallelDataHandling, self).__init__()
         assert dim in (2, 3)
@@ -135,8 +136,6 @@ class ParallelDataHandling(DataHandling):
         self._field_name_to_cpu_data_name[name] = name
         if gpu:
             self._field_name_to_gpu_data_name[name] = self.GPU_DATA_PREFIX + name
-
-        self._rebuild_data_cache()
         return self.fields[name]
 
     def has_data(self, name):
@@ -156,15 +155,8 @@ class ParallelDataHandling(DataHandling):
 
     def swap(self, name1, name2, gpu=False):
         if gpu:
-            for d in self._data_cache_gpu:
-                d[name1], d[name2] = d[name2], d[name1]
-
             name1 = self.GPU_DATA_PREFIX + name1
             name2 = self.GPU_DATA_PREFIX + name2
-        else:
-            for d in self._data_cache_cpu:
-                d[name1], d[name2] = d[name2], d[name1]
-
         for block in self.blocks:
             block[name1].swapDataPointers(block[name2])
 
@@ -222,31 +214,31 @@ class ParallelDataHandling(DataHandling):
             arr = arr[:, :, 0]
         return arr
 
-    def _rebuild_data_cache(self):
-        self._data_cache_cpu = []
-        self._data_cache_gpu = []
-
-        elements = [(self._data_cache_cpu, wlb.field.toArray, self._field_name_to_cpu_data_name)]
-        if self._field_name_to_gpu_data_name:
-            elements.append((self._data_cache_gpu, wlb.cuda.toGpuArray, self._field_name_to_gpu_data_name))
-
-        for cache, to_array, name_to_data_name in elements:
-            for block in self.blocks:
-                field_args = {}
-                for field_name, data_name in name_to_data_name.items():
-                    field = self.fields[field_name]
-                    arr = to_array(block[data_name], withGhostLayers=[True, True, self.dim == 3])
-                    arr = self._normalize_arr_shape(arr, field.index_dimensions)
-                    field_args[field_name] = arr
-                cache.append(field_args)
-
     def run_kernel(self, kernel_function, **kwargs):
+        for arg_dict in self.get_kernel_kwargs(kernel_function, **kwargs):
+            kernel_function(**arg_dict)
+
+    def get_kernel_kwargs(self, kernel_function, **kwargs):
         if kernel_function.ast.backend == 'gpucuda':
-            for d in self._data_cache_gpu:
-                kernel_function(**d, **kwargs)
+            name_map = self._field_name_to_gpu_data_name
+            to_array = wlb.cuda.toGpuArray
         else:
-            for d in self._data_cache_cpu:
-                kernel_function(**d, **kwargs)
+            name_map = self._field_name_to_cpu_data_name
+            to_array = wlb.field.toArray
+        data_used_in_kernel = [(name_map[p.field_name], self.fields[p.field_name])
+                               for p in kernel_function.parameters if
+                               p.is_field_ptr_argument and p.field_name not in kwargs]
+
+        result = []
+        for block in self.blocks:
+            field_args = {}
+            for data_name, f in data_used_in_kernel:
+                arr = to_array(block[data_name], withGhostLayers=[True, True, self.dim == 3])
+                arr = self._normalize_arr_shape(arr, f.index_dimensions)
+                field_args[f.name] = arr
+            field_args.update(kwargs)
+            result.append(field_args)
+        return result
 
     def to_cpu(self, name):
         if name in self._custom_data_transfer_functions:
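
`get_kernel_kwargs` builds one argument dictionary per block, converting only the walberla fields the kernel actually accesses; `run_kernel` just iterates over that list. A sketch of reusing the pre-built dictionaries (`dh` and `my_kernel` are placeholders, the scalar parameter `omega` is illustrative):

```python
# Build the per-block kwargs once ...
per_block_kwargs = dh.get_kernel_kwargs(my_kernel, omega=1.9)

# ... and reuse them across many calls; one pass over the list is equivalent
# to dh.run_kernel(my_kernel, omega=1.9)
for block_kwargs in per_block_kwargs:
    my_kernel(**block_kwargs)
```
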
diff --git a/datahandling/serial_datahandling.py b/datahandling/serial_datahandling.py
index c6026a81ac7ad2d6477fd810195a71fd898d94e7..3bd2ff95c896c38be989a930293b3d010f3608c0 100644
--- a/datahandling/serial_datahandling.py
+++ b/datahandling/serial_datahandling.py
@@ -211,6 +211,12 @@ class SerialDataHandling(DataHandling):
         arrays = self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays
         kernel_function(**arrays, **kwargs)
 
+    def get_kernel_kwargs(self, kernel_function, **kwargs):
+        result = {}
+        result.update(self.gpu_arrays if kernel_function.ast.backend == 'gpucuda' else self.cpu_arrays)
+        result.update(kwargs)
+        return [result]
+
     def to_cpu(self, name):
         if name in self._custom_data_transfer_functions:
             transfer_func = self._custom_data_transfer_functions[name][1]
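
The serial variant returns a single-element list so both data handlings expose the same interface; the result can be handed directly to `TimeLoop.add_call` (names below are illustrative):

```python
# Works identically for SerialDataHandling and ParallelDataHandling:
time_loop.add_call(my_kernel, dh.get_kernel_kwargs(my_kernel))
```
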
diff --git a/timeloop.py b/timeloop.py
index 9a4a3c46187ff2b4b19c51301f0078356290542c..ec5b32e472a08f6e6448d375d4c4e75920206134 100644
--- a/timeloop.py
+++ b/timeloop.py
@@ -1,56 +1,81 @@
 import time
 
+from pystencils.integer_functions import modulo_ceil
+
 
 class TimeLoop:
-    def __init__(self):
-        self._preRunFunctions = []
-        self._postRunFunctions = []
-        self._timeStepFunctions = []
-        self._functionNames = []
+    def __init__(self, steps=2):
+        self._call_data = []
+        self._fixed_steps = steps
+        self._pre_run_functions = []
+        self._post_run_functions = []
+        self._single_step_functions = []
         self.time_steps_run = 0
 
-    def add_step(self, step_obj):
-        if hasattr(step_obj, 'pre_run'):
-            self.add_pre_run_function(step_obj.pre_run)
-        if hasattr(step_obj, 'post_run'):
-            self.add_post_run_function(step_obj.post_run)
-        self.add(step_obj.time_step, step_obj.name)
-
-    def add(self, time_step_function, name=None):
-        if name is None:
-            name = str(time_step_function)
-        self._timeStepFunctions.append(time_step_function)
-        self._functionNames.append(name)
-
-    def add_kernel(self, data_handling, kernel_func, name=None):
-        self.add(lambda: data_handling.run_kernel(kernel_func), name)
+    @property
+    def fixed_steps(self):
+        return self._fixed_steps
 
     def add_pre_run_function(self, f):
-        self._preRunFunctions.append(f)
+        self._pre_run_functions.append(f)
 
     def add_post_run_function(self, f):
-        self._postRunFunctions.append(f)
+        self._post_run_functions.append(f)
+
+    def add_single_step_function(self, f):
+        self._single_step_functions.append(f)
+
+    def add_call(self, functor, argument_list):
+        if hasattr(functor, 'kernel'):
+            functor = functor.kernel
+        if not isinstance(argument_list, list):
+            argument_list = [argument_list]
+
+        for argument_dict in argument_list:
+            self._call_data.append((functor, argument_dict))
+
+    def pre_run(self):
+        for f in self._pre_run_functions:
+            f()
+
+    def post_run(self):
+        for f in self._post_run_functions:
+            f()
 
     def run(self, time_steps=1):
         self.pre_run()
-
+        fixed_steps = self._fixed_steps
+        call_data = self._call_data
+        main_iterations, rest_iterations = divmod(time_steps, fixed_steps)
         try:
-            for i in range(time_steps):
-                self.time_step()
+            for _ in range(main_iterations):
+                for func, kwargs in call_data:
+                    func(**kwargs)
+                self.time_steps_run += fixed_steps
+            for _ in range(rest_iterations):
+                for func in self._single_step_functions:
+                    func()
+                self.time_steps_run += 1
         except KeyboardInterrupt:
             pass
-
         self.post_run()
 
     def benchmark_run(self, time_steps=0, init_time_steps=0):
+        init_time_steps_rounded = modulo_ceil(init_time_steps, self._fixed_steps)
+        time_steps_rounded = modulo_ceil(time_steps, self._fixed_steps)
+
         self.pre_run()
-        for i in range(init_time_steps):
-            self.time_step()
+        for i in range(init_time_steps_rounded // self._fixed_steps):
+            for func, kwargs in self._call_data:
+                func(**kwargs)
+        self.time_steps_run += init_time_steps_rounded
 
         start = time.perf_counter()
-        for i in range(time_steps):
-            self.time_step()
+        for i in range(time_steps_rounded // self._fixed_steps):
+            for func, kwargs in self._call_data:
+                func(**kwargs)
         end = time.perf_counter()
+        self.time_steps_run += time_steps_rounded
         self.post_run()
 
         time_for_one_iteration = (end - start) / time_steps
@@ -61,10 +86,12 @@ class TimeLoop:
         self.pre_run()
         start = time.perf_counter()
         while time.perf_counter() < start + seconds:
-            self.time_step()
-            iterations += 1
+            for func, kwargs in self._call_data:
+                func(**kwargs)
+            iterations += self._fixed_steps
         end = time.perf_counter()
         self.post_run()
+        self.time_steps_run += iterations
         return iterations, end - start
 
     def benchmark(self, time_for_benchmark=5, init_time_steps=2, number_of_time_steps_for_estimation='auto'):
@@ -88,16 +115,3 @@ class TimeLoop:
         time_steps = int(time_for_benchmark / duration_of_time_step)
         time_steps = max(time_steps, 4)
         return self.benchmark_run(time_steps, init_time_steps)
-
-    def pre_run(self):
-        for f in self._preRunFunctions:
-            f()
-
-    def post_run(self):
-        for f in self._postRunFunctions:
-            f()
-
-    def time_step(self):
-        for f in self._timeStepFunctions:
-            f()
-        self.time_steps_run += 1
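
Putting the pieces together, a minimal sketch of the reworked loop; the module path `pystencils.timeloop` as well as `dh`, `even_kernel`, `odd_kernel` and `one_time_step` are assumptions, not part of this patch:

```python
from pystencils.timeloop import TimeLoop  # assumed import path

# With steps=2 the queued calls must together advance two time steps,
# e.g. the even and odd kernels of an in-place update scheme.
loop = TimeLoop(steps=2)
loop.add_call(even_kernel, dh.get_kernel_kwargs(even_kernel))
loop.add_call(odd_kernel, dh.get_kernel_kwargs(odd_kernel))

# Plain Python callable used for the remainder when the requested number of
# steps is not a multiple of fixed_steps.
loop.add_single_step_function(one_time_step)

loop.run(101)                 # 50 passes over the queued calls, then 1 single step
print(loop.time_steps_run)    # 101
```
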