From e33d7b14c741330a522b0f8794a0aa4ebaef49a6 Mon Sep 17 00:00:00 2001
From: Markus Holzer <markus.holzer@fau.de>
Date: Tue, 17 Oct 2023 12:49:00 +0200
Subject: [PATCH] Fix Outflow Boundary

---
 .../lbmpy_walberla/additional_data_handler.py |  46 ++++----
 python/lbmpy_walberla/boundary_collection.py  |  27 +++--
 python/lbmpy_walberla/packing_kernels.py      |   8 +-
 .../lbmpy_walberla/storage_specification.py   |  75 ++++++++++--
 python/lbmpy_walberla/sweep_collection.py     |  38 ++++---
 .../templates/BoundaryCollection.tmpl.h       |   7 +-
 .../LbmStorageSpecification.tmpl.cpp          |  44 +++----
 .../templates/LbmStorageSpecification.tmpl.h  |  36 +++++-
 python/lbmpy_walberla/walberla_lbm_package.py |  21 ++--
 .../additional_data_handler.py                |   4 +
 python/pystencils_walberla/boundary.py        |  17 +--
 .../templates/Boundary.tmpl.h                 |   4 +
 .../templates/SweepCollection.tmpl.h          | 107 ++++++++++--------
 13 files changed, 282 insertions(+), 152 deletions(-)

diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py
index 692d0cf57..9a1f35187 100644
--- a/python/lbmpy_walberla/additional_data_handler.py
+++ b/python/lbmpy_walberla/additional_data_handler.py
@@ -1,5 +1,6 @@
 from pystencils import Target
 from pystencils.stencil import inverse_direction
+from pystencils.typing import BasicType
 
 from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index
 from lbmpy.advanced_streaming.indexing import MirroredStencilDirections
@@ -29,22 +30,6 @@ class FreeSlipAdditionalDataHandler(AdditionalDataHandler):
         self._boundary_object = boundary_object
         super(FreeSlipAdditionalDataHandler, self).__init__(stencil=stencil)
 
-    @property
-    def constructor_arguments(self):
-        return ""
-
-    @property
-    def initialiser_list(self):
-        return ""
-
-    @property
-    def additional_arguments_for_fill_function(self):
-        return ""
-
-    @property
-    def additional_parameters_for_fill_function(self):
-        return ""
-
     def data_initialisation(self, direction):
         def array_pattern(dtype, name, content):
             return f"const {str(dtype)} {name} [] = {{ {','.join(str(c) for c in content)} }};"
@@ -102,10 +87,6 @@ class FreeSlipAdditionalDataHandler(AdditionalDataHandler):
 
         return "\n".join(init_list)
 
-    @property
-    def additional_member_variable(self):
-        return ""
-
 
 class UBBAdditionalDataHandler(AdditionalDataHandler):
     def __init__(self, stencil, boundary_object):
@@ -113,10 +94,14 @@ class UBBAdditionalDataHandler(AdditionalDataHandler):
         self._boundary_object = boundary_object
         super(UBBAdditionalDataHandler, self).__init__(stencil=stencil)
 
+    @property
+    def constructor_argument_name(self):
+        return "velocityCallback"
+
     @property
     def constructor_arguments(self):
-        return ", std::function<Vector3<real_t>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& " \
-               "velocityCallback "
+        return f", std::function<Vector3<real_t>(const Cell &, const shared_ptr<StructuredBlockForest>&, IBlock&)>& " \
+               f"{self.constructor_argument_name} "
 
     @property
     def initialiser_list(self):
@@ -146,7 +131,7 @@ class UBBAdditionalDataHandler(AdditionalDataHandler):
 
 
 class OutflowAdditionalDataHandler(AdditionalDataHandler):
-    def __init__(self, stencil, boundary_object, target=Target.CPU, field_name='pdfs'):
+    def __init__(self, stencil, boundary_object, target=Target.CPU, field_name='pdfs', pdfs_data_type=None):
         assert isinstance(boundary_object, ExtrapolationOutflow)
         self._boundary_object = boundary_object
         self._stencil = boundary_object.stencil
@@ -154,11 +139,22 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler):
         self._normal_direction = boundary_object.normal_direction
         self._field_name = field_name
         self._target = target
+        self._dtype = BasicType(boundary_object.data_type).c_name
+        if pdfs_data_type is None:
+            self._pdfs_data_type = "real_t"
+        else:
+            pdfs_data_type = BasicType(pdfs_data_type)
+            self._pdfs_data_type = pdfs_data_type.c_name
+
         super(OutflowAdditionalDataHandler, self).__init__(stencil=stencil)
 
         assert sum([a != 0 for a in self._normal_direction]) == 1, \
             "The outflow boundary is only implemented for straight walls at the moment."
 
+    @property
+    def constructor_argument_name(self):
+        return f"{self._field_name}CPUID_" if self._target == Target.GPU else ""
+
     @property
     def constructor_arguments(self):
         return f", BlockDataID {self._field_name}CPUID_" if self._target == Target.GPU else ""
@@ -170,7 +166,7 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler):
     @property
     def additional_field_data(self):
         identifier = "CPU" if self._target == Target.GPU else ""
-        return f"auto {self._field_name} = block->getData< field::GhostLayerField<real_t, " \
+        return f"auto {self._field_name} = block->getData< field::GhostLayerField<{self._pdfs_data_type}, " \
                f"{len(self._stencil)}> >({self._field_name}{identifier}ID); "
 
     def data_initialisation(self, direction_index):
@@ -181,7 +177,7 @@ class OutflowAdditionalDataHandler(AdditionalDataHandler):
 
         init_list = []
         for key, value in self.get_init_dict(pdf_acc, direction_index).items():
-            init_list.append(f"element.{key} = {self._field_name}->get({value});")
+            init_list.append(f"element.{key} = {self._dtype}( {self._field_name}->get({value}) );")
 
         return "\n".join(init_list)
 
diff --git a/python/lbmpy_walberla/boundary_collection.py b/python/lbmpy_walberla/boundary_collection.py
index 17bfa245a..082567204 100644
--- a/python/lbmpy_walberla/boundary_collection.py
+++ b/python/lbmpy_walberla/boundary_collection.py
@@ -15,8 +15,9 @@ from pystencils import Target
 import numpy as np
 
 
-def lbm_boundary_generator(class_name: str, flag_uid: str, boundary_object: LbBoundary, additional_data_handler=None):
-    def generation_function(ctx, lb_method, field_name='pdfs',
+def lbm_boundary_generator(class_name: str, flag_uid: str, boundary_object: LbBoundary, additional_data_handler=None,
+                           field_data_type=None):
+    def generation_function(ctx, lb_method, field_name='pdfs', spatial_shape=None,
                             streaming_pattern='pull', after_collision=True,
                             namespace='lbm',
                             **create_kernel_params):
@@ -25,6 +26,8 @@ def lbm_boundary_generator(class_name: str, flag_uid: str, boundary_object: LbBo
                                                       boundary_object=boundary_object,
                                                       lb_method=lb_method,
                                                       field_name=field_name,
+                                                      spatial_shape=spatial_shape,
+                                                      field_data_type=field_data_type,
                                                       streaming_pattern=streaming_pattern,
                                                       after_collision=after_collision,
                                                       additional_data_handler=additional_data_handler,
@@ -41,6 +44,7 @@ def generate_boundary_collection(generation_context,
                                  boundary_generators,
                                  lb_method,
                                  field_name='pdfs',
+                                 spatial_shape=None,
                                  streaming_pattern='pull',
                                  prev_timestep=Timestep.BOTH,
                                  namespace='lbm',
@@ -49,22 +53,26 @@ def generate_boundary_collection(generation_context,
     kernel_list = []
     includes = []
     boundary_classes = []
+    additional_data_handlers = []
     flag_uids = []
     object_names = []
     targets = []
 
     for boundary_generator in boundary_generators:
         boundary_functor = boundary_generator['generator']
-        context = boundary_functor(generation_context, lb_method, field_name, streaming_pattern, prev_timestep,
-                                   namespace, **create_kernel_params)
+        context = boundary_functor(generation_context, lb_method, field_name, spatial_shape,
+                                   streaming_pattern, prev_timestep, namespace, **create_kernel_params)
 
         kernel_list.append(context['kernel'])
         includes.append(f"\"{context['class_name']}.h\"")
         boundary_classes.append(f"{context['namespace']}::{context['class_name']}")
+        additional_data_handlers.append(context['additional_data_handler'])
         flag_uids.append(boundary_generator['flag_id'])
         object_names.append(f"{context['class_name']}Object")
         targets.append(f"{context['target']}")
 
+    additional_constructor_arguments = [a.constructor_arguments[2:] for a in additional_data_handlers]
+
     assert len(set(targets)) == 1
     target = targets[0]
 
@@ -75,6 +83,8 @@ def generate_boundary_collection(generation_context,
         'namespace': namespace,
         'includes': includes,
         'boundary_classes': boundary_classes,
+        'additional_data_handlers': additional_data_handlers,
+        'additional_constructor_arguments': additional_constructor_arguments,
         'flag_uids': flag_uids,
         'object_names': object_names
     }
@@ -94,6 +104,8 @@ def __generate_alternating_lbm_boundary(generation_context,
                                         boundary_object,
                                         lb_method,
                                         field_name='pdfs',
+                                        spatial_shape=None,
+                                        field_data_type=None,
                                         streaming_pattern='pull',
                                         after_collision=True,
                                         additional_data_handler=None,
@@ -132,13 +144,14 @@ def __generate_alternating_lbm_boundary(generation_context,
             return OddIntegerCondition(timestep_param_name, kernel_even, kernel_odd, timestep_param_dtype)
 
     timestep_advancement = {"field_name": field_name, "function": "getTimestep"}
-
     context = pystencils_walberla.boundary.generate_boundary(generation_context,
-                                                             class_name,
-                                                             boundary_object,
+                                                             class_name=class_name,
+                                                             boundary_object=boundary_object,
                                                              field_name=field_name,
                                                              neighbor_stencil=lb_method.stencil,
                                                              index_shape=[lb_method.stencil.Q],
+                                                             spatial_shape=spatial_shape,
+                                                             field_data_type=field_data_type,
                                                              kernel_creation_function=boundary_creation_function,
                                                              namespace=namespace,
                                                              additional_data_handler=additional_data_handler,
diff --git a/python/lbmpy_walberla/packing_kernels.py b/python/lbmpy_walberla/packing_kernels.py
index 53e5d877e..d5caad4e9 100644
--- a/python/lbmpy_walberla/packing_kernels.py
+++ b/python/lbmpy_walberla/packing_kernels.py
@@ -83,7 +83,8 @@ def generate_packing_kernels(generation_context: CodeGenerationContext, class_na
 
 class PackingKernelsCodegen:
 
-    def __init__(self, stencil, streaming_pattern, class_name, config: CreateKernelConfig):
+    def __init__(self, stencil, streaming_pattern, class_name, config: CreateKernelConfig,
+                 src_field=None, dst_field=None):
         self.stencil = stencil
         self.dim = stencil.D
         self.values_per_cell = stencil.Q
@@ -94,8 +95,9 @@ class PackingKernelsCodegen:
         self.config = config
         self.data_type = config.data_type['pdfs'].numpy_dtype
 
-        self.src_field, self.dst_field = fields(
-            f'pdfs_src({self.values_per_cell}), pdfs_dst({self.values_per_cell}) :{self.data_type}[{self.dim}D]')
+        self.src_field = src_field if src_field else fields(f'pdfs_src({stencil.Q}) :{self.data_type}[{stencil.D}D]')
+        self.dst_field = dst_field if dst_field else fields(f'pdfs_dst({stencil.Q}) :{self.data_type}[{stencil.D}D]')
+
         self.accessors = [get_accessor(streaming_pattern, t) for t in get_timesteps(streaming_pattern)]
         self.mask_field = fields(f'mask : uint32 [{self.dim}D]')
 
diff --git a/python/lbmpy_walberla/storage_specification.py b/python/lbmpy_walberla/storage_specification.py
index 544867aac..a07c619b3 100644
--- a/python/lbmpy_walberla/storage_specification.py
+++ b/python/lbmpy_walberla/storage_specification.py
@@ -4,10 +4,10 @@ from dataclasses import replace
 from jinja2 import Environment, PackageLoader, StrictUndefined
 import numpy as np
 
-from pystencils import Target
+from pystencils import fields, Target
 
-from lbmpy import LBMConfig
-from lbmpy.advanced_streaming import is_inplace
+from lbmpy import LBMConfig, LBMOptimisation
+from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep
 from lbmpy.methods import AbstractLbMethod
 
 from pystencils_walberla.cmake_integration import CodeGenerationContext
@@ -17,8 +17,11 @@ from lbmpy_walberla.packing_kernels import PackingKernelsCodegen
 
 
 def generate_lbm_storage_specification(generation_context: CodeGenerationContext, class_name: str,
-                                       method: AbstractLbMethod, lbm_config: LBMConfig, nonuniform: bool = False,
-                                       target: Target = Target.CPU, data_type=None, cpu_openmp: bool = False,
+                                       method: AbstractLbMethod,
+                                       lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
+                                       nonuniform: bool = False,
+                                       target: Target = Target.CPU,
+                                       data_type=None, cpu_openmp: bool = False,
                                        **create_kernel_params):
     namespace = "lbm"
     stencil = method.stencil
@@ -32,10 +35,30 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
     config = replace(config, cpu_vectorize_info=None)
 
     default_dtype = config.data_type.default_factory()
-    is_float = True if issubclass(default_dtype.numpy_dtype.type, np.float32) else False
-    constant_suffix = "f" if is_float else ""
-
-    cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config)
+    if issubclass(default_dtype.numpy_dtype.type, np.float64):
+        constant_suffix = ""
+        data_type_string = "double"
+    elif issubclass(default_dtype.numpy_dtype.type, np.float32):
+        constant_suffix = "f"
+        data_type_string = "float"
+    elif issubclass(default_dtype.numpy_dtype.type, np.float16):
+        constant_suffix = ""
+        data_type_string = "half"
+    else:
+        raise ValueError(f"default datatype {default_dtype.numpy_dtype.type} is not supported. "
+                         f"Supported are only np.float64, np.float32 and np.float16")
+
+    symbolic_field = lbm_optimisation.symbolic_field
+    assert symbolic_field, "The symbolic pdf field must be added to LBMOptimisation as symbolic_field"
+    if is_inplace(streaming_pattern):
+        symbolic_temporary_field = symbolic_field.new_field_with_different_name("pdfs_tmp")
+    else:
+        assert lbm_optimisation.symbolic_temporary_field, \
+            "The destination field must be added to LBMOptimisation as symbolic_temporary_field"
+        symbolic_temporary_field = lbm_optimisation.symbolic_temporary_field
+
+    cg = PackingKernelsCodegen(stencil, streaming_pattern, class_name, config,
+                               src_field=symbolic_field, dst_field=symbolic_temporary_field)
     kernels = cg.create_uniform_kernel_families()
 
     if nonuniform:
@@ -50,6 +73,16 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
     cqc = method.conserved_quantity_computation
     equilibrium = method.equilibrium_distribution
 
+    f = fields(f"f({stencil.Q}): double[{stencil.D}D]", layout='fzyx')
+    even_accessor = get_accessor(streaming_pattern, Timestep.EVEN)
+    odd_accessor = get_accessor(streaming_pattern, Timestep.ODD)
+
+    even_read = even_accessor.read(f, stencil)
+    even_write = even_accessor.write(f, stencil)
+
+    odd_read = odd_accessor.read(f, stencil)
+    odd_write = odd_accessor.write(f, stencil)
+
     jinja_context = {
         'class_name': class_name,
         'namespace': namespace,
@@ -62,12 +95,17 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
         'equilibrium_deviation_only': equilibrium.deviation_only,
         'inplace': is_inplace(streaming_pattern),
         'zero_centered': cqc.zero_centered_pdfs,
-        'weights': ",".join(str(w.evalf()) + constant_suffix for w in method.weights),
-        'inverse_weights': ",".join(str((1 / w).evalf()) + constant_suffix for w in method.weights),
+
+        'weights': ", ".join(f"{data_type_string}({str(w.evalf())})" for w in method.weights),
+        'inverse_weights': ", ".join(f"{data_type_string}({str((1 / w).evalf())})" for w in method.weights),
+        'even_read': _get_access_list(even_read, stencil.D),
+        'even_write': _get_access_list(even_write, stencil.D),
+        'odd_read': _get_access_list(odd_read, stencil.D),
+        'odd_write': _get_access_list(odd_write, stencil.D),
 
         'nonuniform': nonuniform,
         'target': target.name.lower(),
-        'dtype': "float" if is_float else "double",
+        'dtype': data_type_string,
         'is_gpu': target == Target.GPU,
         'kernels': kernels,
         'direction_sizes': cg.get_direction_sizes(),
@@ -87,3 +125,16 @@ def generate_lbm_storage_specification(generation_context: CodeGenerationContext
     source_extension = "cu" if target == Target.GPU and generation_context.cuda else "cpp"
     generation_context.write_file(f"{class_name}.h", header)
     generation_context.write_file(f"{class_name}.{source_extension}", source)
+
+
+def _get_access_list(access_list, dim):
+    result = []
+    for i in range(dim):
+        result.append(", ".join([str(int(field_access.offsets[i])) for field_access in access_list]))
+
+    if dim == 2:
+        result.append(", ".join(["0"] * len(access_list)))
+
+    result.append(", ".join([str(int(field_access.index[0])) for field_access in access_list]))
+
+    return result
diff --git a/python/lbmpy_walberla/sweep_collection.py b/python/lbmpy_walberla/sweep_collection.py
index 8edd0779b..7aed4a33b 100644
--- a/python/lbmpy_walberla/sweep_collection.py
+++ b/python/lbmpy_walberla/sweep_collection.py
@@ -7,9 +7,10 @@ import numpy as np
 from pystencils import Target, create_kernel
 from pystencils.config import CreateKernelConfig
 from pystencils.field import Field
+from pystencils.simp import add_subexpressions_for_field_reads
 
 from lbmpy.advanced_streaming import is_inplace, get_accessor, Timestep
-from lbmpy.creationfunctions import LbmCollisionRule
+from lbmpy.creationfunctions import LbmCollisionRule, LBMConfig, LBMOptimisation
 from lbmpy.fieldaccess import CollideOnlyInplaceAccessor
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter, macroscopic_values_getter
 from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
@@ -23,35 +24,32 @@ from .function_generator import kernel_family_function_generator
 
 
 def generate_lbm_sweep_collection(ctx, class_name: str, collision_rule: LbmCollisionRule,
-                                  streaming_pattern='pull',
-                                  field_layout='fzyx', refinement_scaling=None,
-                                  macroscopic_fields: Dict[str, Field] = None,
+                                  lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
+                                  refinement_scaling=None, macroscopic_fields: Dict[str, Field] = None,
                                   target=Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
                                   max_threads=None,
                                   **create_kernel_params):
+
     config = config_from_context(ctx, target=target, data_type=data_type,
                                  cpu_openmp=cpu_openmp, cpu_vectorize_info=cpu_vectorize_info, **create_kernel_params)
 
+    streaming_pattern = lbm_config.streaming_pattern
+    field_layout = lbm_optimisation.field_layout
+
     # usually a numpy layout is chosen by default i.e. xyzf - which is bad for waLBerla where at least the spatial
     # coordinates should be ordered in reverse direction i.e. zyx
     lb_method = collision_rule.method
 
-    q = lb_method.stencil.Q
-    dim = lb_method.stencil.D
-
     if field_layout == 'fzyx':
         config.cpu_vectorize_info['assume_inner_stride_one'] = True
     elif field_layout == 'zyxf':
         config.cpu_vectorize_info['assume_inner_stride_one'] = False
 
-    src_field = Field.create_generic('pdfs', dim, config.data_type['pdfs'].numpy_dtype,
-                                     index_dimensions=1, layout=field_layout, index_shape=(q,))
+    src_field = lbm_optimisation.symbolic_field
     if is_inplace(streaming_pattern):
         dst_field = src_field
     else:
-        dst_field = Field.create_generic('pdfs_tmp', dim, config.data_type['pdfs_tmp'].numpy_dtype,
-                                         index_dimensions=1, layout=field_layout,
-                                         index_shape=(q,))
+        dst_field = lbm_optimisation.symbolic_temporary_field
 
     config = replace(config, ghost_layers=0)
 
@@ -98,15 +96,17 @@ class RefinementScaling:
 def lbm_kernel_family(class_name, kernel_name,
                       collision_rule, streaming_pattern, src_field, dst_field, config: CreateKernelConfig):
 
+    default_dtype = config.data_type.default_factory()
     if kernel_name == "streamCollide":
         def lbm_kernel(field_accessor, lb_stencil):
-            return create_lbm_kernel(collision_rule, src_field, dst_field, field_accessor)
+            return create_lbm_kernel(collision_rule, src_field, dst_field, field_accessor, data_type=default_dtype)
         advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"}
         temporary_fields = ['pdfs_tmp']
         field_swaps = [('pdfs', 'pdfs_tmp')]
     elif kernel_name == "collide":
         def lbm_kernel(field_accessor, lb_stencil):
-            return create_lbm_kernel(collision_rule, src_field, dst_field, CollideOnlyInplaceAccessor())
+            return create_lbm_kernel(collision_rule, src_field, dst_field, CollideOnlyInplaceAccessor(),
+                                     data_type=default_dtype)
         advance_timestep = {"field_name": src_field.name, "function": "advanceTimestep"}
         temporary_fields = ()
         field_swaps = ()
@@ -161,6 +161,8 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
     density = macroscopic_fields.get('density', 1.0)
     velocity = macroscopic_fields.get('velocity', [0.0] * dim)
 
+    default_dtype = config.data_type.default_factory()
+
     get_timestep = {"field_name": pdfs.name, "function": "getTimestep"}
     temporary_fields = ()
     field_swaps = ()
@@ -173,6 +175,9 @@ def get_setter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
                                                density=density, velocity=velocity, pdfs=pdfs,
                                                streaming_pattern=streaming_pattern, previous_timestep=timestep)
 
+            if default_dtype != pdfs.dtype:
+                setter = add_subexpressions_for_field_reads(setter, data_type=default_dtype)
+
             setter_ast = create_kernel(setter, config=config)
             setter_ast.function_name = 'kernel_initialise' + timestep_suffix
             nodes.append(KernelCallNode(setter_ast))
@@ -199,6 +204,8 @@ def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
     if density is None and velocity is None:
         return None
 
+    default_dtype = config.data_type.default_factory()
+
     get_timestep = {"field_name": pdfs.name, "function": "getTimestep"}
     temporary_fields = ()
     field_swaps = ()
@@ -211,6 +218,9 @@ def get_getter_family(class_name, lb_method, pdfs, streaming_pattern, macroscopi
                                                density=density, velocity=velocity, pdfs=pdfs,
                                                streaming_pattern=streaming_pattern, previous_timestep=timestep)
 
+            if default_dtype != pdfs.dtype:
+                getter = add_subexpressions_for_field_reads(getter, data_type=default_dtype)
+
             getter_ast = create_kernel(getter, config=config)
             getter_ast.function_name = 'kernel_getter' + timestep_suffix
             nodes.append(KernelCallNode(getter_ast))
diff --git a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h
index 5f4913784..47f313860 100644
--- a/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h
+++ b/python/lbmpy_walberla/templates/BoundaryCollection.tmpl.h
@@ -41,12 +41,12 @@ class {{class_name}}
    enum Type { ALL = 0, INNER = 1, OUTER = 2 };
 
 
-   {{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])]] | type_identifier_list -}} )
+   {{class_name}}( {{- ["const shared_ptr<StructuredBlockForest> & blocks", "BlockDataID flagID_", "BlockDataID pdfsID_", "FlagUID domainUID_", [kernel_list|generate_constructor_parameters(['indexVector', 'indexVectorSize', 'pdfs'])], additional_constructor_arguments] | type_identifier_list -}} )
       : blocks_(blocks), flagID(flagID_), pdfsID(pdfsID_), domainUID(domainUID_)
    {
-      {% for object_name, boundary_class, kernel in zip(object_names, boundary_classes, kernel_list) -%}
+      {% for object_name, boundary_class, kernel, additional_data_handler in zip(object_names, boundary_classes, kernel_list, additional_data_handlers) -%}
 
-      {{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'])]] | type_identifier_list -}});
+      {{object_name}} = std::make_shared< {{boundary_class}} >({{- ["blocks", "pdfsID", [kernel|generate_function_collection_call(['indexVector', 'indexVectorSize', 'pdfs', 'timestep', 'gpuStream'])], additional_data_handler.constructor_argument_name] | type_identifier_list -}});
       {% endfor %}
 
       {% for object_name, flag_uid in zip(object_names, flag_uids) -%}
@@ -105,4 +105,3 @@ class {{class_name}}
 
 }
 }
-
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
index 91c7d7d96..92106cddf 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.cpp
@@ -27,31 +27,31 @@
 #   pragma GCC diagnostic ignored "-Wunused-variable"
 #endif
 
-/*************************************************************************************
+namespace walberla {
+namespace {{namespace}} {
+
+   /*************************************************************************************
  *                                Kernel Definitions
 *************************************************************************************/
-{{ kernels['packAll']      | generate_definitions }}
-{{ kernels['unpackAll']    | generate_definitions }}
-{{ kernels['localCopyAll'] | generate_definitions }}
-
-{{ kernels['packDirection']      | generate_definitions }}
-{{ kernels['unpackDirection']    | generate_definitions }}
-{{ kernels['localCopyDirection'] | generate_definitions }}
-
-{% if nonuniform -%}
-{{ kernels['unpackRedistribute']    | generate_definitions }}
-{{ kernels['packPartialCoalescence']    | generate_definitions }}
-{{ kernels['zeroCoalescenceRegion']    | generate_definitions }}
-{{ kernels['unpackCoalescence']    | generate_definitions }}
-{%- endif %}
-
-/*************************************************************************************
+   {{ kernels['packAll']      | generate_definitions }}
+   {{ kernels['unpackAll']    | generate_definitions }}
+   {{ kernels['localCopyAll'] | generate_definitions }}
+
+   {{ kernels['packDirection']      | generate_definitions }}
+   {{ kernels['unpackDirection']    | generate_definitions }}
+   {{ kernels['localCopyDirection'] | generate_definitions }}
+
+   {% if nonuniform -%}
+   {{ kernels['unpackRedistribute']    | generate_definitions }}
+   {{ kernels['packPartialCoalescence']    | generate_definitions }}
+   {{ kernels['zeroCoalescenceRegion']    | generate_definitions }}
+   {{ kernels['unpackCoalescence']    | generate_definitions }}
+   {%- endif %}
+
+   /*************************************************************************************
  *                                 Kernel Wrappers
 *************************************************************************************/
 
-namespace walberla {
-namespace {{namespace}} {
-
    void {{class_name}}::PackKernels::packAll(
       {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci",
              "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters,
@@ -128,8 +128,8 @@ namespace {{namespace}} {
       WALBERLA_ASSERT_EQUAL(srcInterval.zSize(), dstInterval.zSize())
 
       {{kernels['localCopyDirection']
-          | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
-          | indent(6) }}
+               | generate_call(cell_interval={src_field : 'srcInterval', dst_field : 'dstInterval'}, stream='stream')
+               | indent(6) }}
    }
 
    {% if nonuniform -%}
diff --git a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
index 5d5409b66..ad40a55ee 100644
--- a/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
+++ b/python/lbmpy_walberla/templates/LbmStorageSpecification.tmpl.h
@@ -78,6 +78,32 @@ class {{class_name}}
    // Inverse lattice weights
    static constexpr {{dtype}} wInv[{{stencil_size}}] = { {{inverse_weights}} };
 
+   struct AccessorEVEN
+   {
+      static constexpr cell_idx_t readX[{{stencil_size}}] = { {{even_read[0]}} };
+      static constexpr cell_idx_t readY[{{stencil_size}}] = { {{even_read[1]}} };
+      static constexpr cell_idx_t readZ[{{stencil_size}}] = { {{even_read[2]}} };
+      static constexpr cell_idx_t readD[{{stencil_size}}] = { {{even_read[3]}} };
+
+      static constexpr cell_idx_t writeX[{{stencil_size}}] = { {{even_write[0]}} };
+      static constexpr cell_idx_t writeY[{{stencil_size}}] = { {{even_write[1]}} };
+      static constexpr cell_idx_t writeZ[{{stencil_size}}] = { {{even_write[2]}} };
+      static constexpr cell_idx_t writeD[{{stencil_size}}] = { {{even_write[3]}} };
+   };
+
+   struct AccessorODD
+   {
+      static constexpr cell_idx_t readX[{{stencil_size}}] = { {{odd_read[0]}} };
+      static constexpr cell_idx_t readY[{{stencil_size}}] = { {{odd_read[1]}} };
+      static constexpr cell_idx_t readZ[{{stencil_size}}] = { {{odd_read[2]}} };
+      static constexpr cell_idx_t readD[{{stencil_size}}] = { {{odd_read[3]}} };
+
+      static constexpr cell_idx_t writeX[{{stencil_size}}] = { {{odd_write[0]}} };
+      static constexpr cell_idx_t writeY[{{stencil_size}}] = { {{odd_write[1]}} };
+      static constexpr cell_idx_t writeZ[{{stencil_size}}] = { {{odd_write[2]}} };
+      static constexpr cell_idx_t writeD[{{stencil_size}}] = { {{odd_write[3]}} };
+   };
+
    // Compute kernels to pack and unpack MPI buffers
    class PackKernels {
 
@@ -96,8 +122,8 @@ class {{class_name}}
       static const bool inplace = {% if inplace -%} true {%- else -%} false {%- endif -%};
 
       /**
-       * Packs all pdfs from the given cell interval to the send buffer.
-       * */
+      * Packs all pdfs from the given cell interval to the send buffer.
+      * */
       void packAll(
          {{- [ "PdfField_T * " + src_field.name, "CellInterval & ci",
                 "unsigned char * outBuffer", kernels['packAll'].kernel_selection_parameters,
@@ -168,7 +194,7 @@ class {{class_name}}
        * @return    The required size of the buffer, in bytes
        * */
       uint_t size (CellInterval & ci, stencil::Direction dir) const {
-         return ci.numCells() * sizes[dir] * sizeof(value_type);
+         return ci.numCells() * sizes[dir] * uint_c(sizeof(value_type));
       }
 
       /**
@@ -178,7 +204,7 @@ class {{class_name}}
        * @return    The required size of the buffer, in bytes
        * */
       uint_t size (CellInterval & ci) const {
-         return ci.numCells() * {{stencil_size}} * sizeof(value_type);
+         return ci.numCells() * {{stencil_size}} * uint_c(sizeof(value_type));
       }
 
       {% if nonuniform -%}
@@ -251,6 +277,8 @@ class {{class_name}}
       const uint_t sizes[{{direction_sizes|length}}] { {{ direction_sizes | join(', ') }} };
    };
 
+   using value_type = PackKernels::value_type;
+
 };
 
 }} //{{namespace}}/walberla
\ No newline at end of file
diff --git a/python/lbmpy_walberla/walberla_lbm_package.py b/python/lbmpy_walberla/walberla_lbm_package.py
index e21d6c961..fd9f4500b 100644
--- a/python/lbmpy_walberla/walberla_lbm_package.py
+++ b/python/lbmpy_walberla/walberla_lbm_package.py
@@ -17,7 +17,9 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
                          lbm_config: LBMConfig, lbm_optimisation: LBMOptimisation,
                          nonuniform: bool = False, boundaries: List[Callable] = None,
                          macroscopic_fields: Dict[str, Field] = None,
-                         target: Target = Target.CPU, data_type=None, cpu_openmp=None, cpu_vectorize_info=None,
+                         target: Target = Target.CPU,
+                         data_type=None, pdfs_data_type=None,
+                         cpu_openmp=None, cpu_vectorize_info=None,
                          max_threads=None,
                          **kernel_parameters):
 
@@ -27,8 +29,10 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
     method = collision_rule.method
 
     storage_spec_name = f'{name}StorageSpecification'
-    generate_lbm_storage_specification(ctx, storage_spec_name, method, lbm_config,
-                                       nonuniform=nonuniform, target=target, data_type=data_type)
+    generate_lbm_storage_specification(ctx, storage_spec_name, method, lbm_config, lbm_optimisation,
+                                       nonuniform=nonuniform, target=target,
+                                       data_type=pdfs_data_type,
+                                       cpu_openmp=cpu_openmp)
 
     if nonuniform:
         omega = get_shear_relaxation_rate(method)
@@ -37,10 +41,8 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
     else:
         refinement_scaling = None
 
-    streaming_pattern = lbm_config.streaming_pattern
     generate_lbm_sweep_collection(ctx, f'{name}SweepCollection', collision_rule,
-                                  streaming_pattern=streaming_pattern,
-                                  field_layout=lbm_optimisation.field_layout,
+                                  lbm_config=lbm_config, lbm_optimisation=lbm_optimisation,
                                   refinement_scaling=refinement_scaling,
                                   macroscopic_fields=macroscopic_fields,
                                   target=target, data_type=data_type,
@@ -48,6 +50,11 @@ def generate_lbm_package(ctx: CodeGenerationContext, name: str,
                                   max_threads=max_threads,
                                   **kernel_parameters)
 
+    spatial_shape = None
+    if lbm_optimisation.symbolic_field.has_fixed_shape:
+        spatial_shape = lbm_optimisation.symbolic_field.spatial_shape + (lbm_config.stencil.Q, )
+
     generate_boundary_collection(ctx, f'{name}BoundaryCollection', boundary_generators=boundaries,
-                                 lb_method=method, streaming_pattern=streaming_pattern,
+                                 lb_method=method, field_name='pdfs', spatial_shape=spatial_shape,
+                                 streaming_pattern=lbm_config.streaming_pattern,
                                  target=target, layout=lbm_optimisation.field_layout)
diff --git a/python/pystencils_walberla/additional_data_handler.py b/python/pystencils_walberla/additional_data_handler.py
index 2c4efcc65..48f87dd53 100644
--- a/python/pystencils_walberla/additional_data_handler.py
+++ b/python/pystencils_walberla/additional_data_handler.py
@@ -16,6 +16,10 @@ class AdditionalDataHandler:
         else:
             self._walberla_stencil = stencil
 
+    @property
+    def constructor_argument_name(self):
+        return ""
+
     @property
     def constructor_arguments(self):
         return ""
diff --git a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py
index c5a5e54c1..7af79ed67 100644
--- a/python/pystencils_walberla/boundary.py
+++ b/python/pystencils_walberla/boundary.py
@@ -19,7 +19,9 @@ def generate_boundary(generation_context,
                       field_name,
                       neighbor_stencil,
                       index_shape,
+                      spatial_shape=None,
                       field_type=FieldType.GENERIC,
+                      field_data_type=None,
                       kernel_creation_function=None,
                       target=Target.CPU,
                       data_type=None,
@@ -47,14 +49,17 @@ def generate_boundary(generation_context,
     del create_kernel_params['default_number_int']
     del create_kernel_params['skip_independence_check']
 
-    field_data_type = config.data_type[field_name].numpy_dtype
+    if field_data_type is None:
+        field_data_type = config.data_type[field_name].numpy_dtype
 
     index_struct_dtype = numpy_data_type_for_boundary_object(boundary_object, dim)
 
-    field = Field.create_generic(field_name, dim,
-                                 field_data_type,
-                                 index_dimensions=len(index_shape), layout=layout, index_shape=index_shape,
-                                 field_type=field_type)
+    if spatial_shape:
+        field = Field.create_fixed_size(field_name, spatial_shape, index_dimensions=len(index_shape),
+                                        dtype=field_data_type, layout=layout, field_type=field_type)
+    else:
+        field = Field.create_generic(field_name, dim, dtype=field_data_type, index_dimensions=len(index_shape),
+                                     layout=layout, index_shape=index_shape, field_type=field_type)
 
     index_field = Field('indexVector', FieldType.INDEXED, index_struct_dtype, layout=[0],
                         shape=(TypedSymbol("indexVectorSize", create_type("int32")), 1), strides=(1, 1))
@@ -126,5 +131,3 @@ def generate_staggered_flux_boundary(generation_context, class_name, boundary_ob
     assert dim == len(neighbor_stencil[0])
     generate_boundary(generation_context, class_name, boundary_object, 'flux', neighbor_stencil, index_shape,
                       FieldType.STAGGERED_FLUX, target=target, **kwargs)
-
-
diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h
index 96a9202c1..75e3cd13a 100644
--- a/python/pystencils_walberla/templates/Boundary.tmpl.h
+++ b/python/pystencils_walberla/templates/Boundary.tmpl.h
@@ -48,6 +48,10 @@
 #define RESTRICT
 #endif
 
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+using walberla::half;
+#endif
+
 namespace walberla {
 namespace {{namespace}} {
 
diff --git a/python/pystencils_walberla/templates/SweepCollection.tmpl.h b/python/pystencils_walberla/templates/SweepCollection.tmpl.h
index 5db4ccb33..2a89fef5e 100644
--- a/python/pystencils_walberla/templates/SweepCollection.tmpl.h
+++ b/python/pystencils_walberla/templates/SweepCollection.tmpl.h
@@ -64,20 +64,20 @@ namespace {{namespace}} {
 
 class {{class_name}}
 {
-public:
-  enum Type { ALL = 0, INNER = 1, OUTER = 2 };
+ public:
+   enum Type { ALL = 0, INNER = 1, OUTER = 2 };
 
    {{class_name}}(const shared_ptr< StructuredBlockStorage > & blocks, {{kernel_list|generate_constructor_parameters}}, const Cell & outerWidth=Cell(1, 1, 1))
-     : blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth)
+      : blocks_(blocks), {{ kernel_list|generate_constructor_initializer_list(parameter_registration=parameter_scaling) }}, outerWidth_(outerWidth)
    {
       {{kernel_list|generate_constructor(parameter_registration=parameter_scaling) |indent(6)}}
 
+      validInnerOuterSplit_= true;
+
       for (auto& iBlock : *blocks)
       {
-         if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 ||
-             int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 ||
-             int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2)
-          WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
+         if (int_c(blocks->getNumberOfXCells(iBlock)) <= outerWidth_[0] * 2 || int_c(blocks->getNumberOfYCells(iBlock)) <= outerWidth_[1] * 2 || int_c(blocks->getNumberOfZCells(iBlock)) <= outerWidth_[2] * 2)
+            validInnerOuterSplit_ = false;
       }
    };
 
@@ -105,54 +105,66 @@ public:
 
    std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", ] | type_identifier_list -}})
    {
+      if (!validInnerOuterSplit_ && type != Type::ALL)
+         WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
+
       switch (type)
       {
-         case Type::INNER:
-            return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
-         case Type::OUTER:
-            return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
-         default:
-            return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
+      case Type::INNER:
+         return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
+      case Type::OUTER:
+         return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
+      default:
+         return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", ] | type_identifier_list -}}); };
       }
    }
 
    std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers"] | type_identifier_list -}})
    {
+      if (!validInnerOuterSplit_ && type != Type::ALL)
+         WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
+
       switch (type)
       {
-         case Type::INNER:
-            return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
-         case Type::OUTER:
-            return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
-         default:
-            return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); };
+      case Type::INNER:
+         return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", ] | type_identifier_list -}}); };
+      case Type::OUTER:
+         return [{{- ["this", ] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", ] | type_identifier_list -}}); };
+      default:
+         return [{{- ["this", "ghost_layers"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers"] | type_identifier_list -}}); };
       }
    }
 
    {% if target is equalto 'gpu' -%}
    std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "const cell_idx_t ghost_layers", "gpuStream_t gpuStream"] | type_identifier_list -}})
    {
+      if (!validInnerOuterSplit_ && type != Type::ALL)
+         WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
+
       switch (type)
       {
-         case Type::INNER:
-            return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
-         case Type::OUTER:
-            return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
-         default:
-            return [{{- ["this", "ghost_layers", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); };
+      case Type::INNER:
+         return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
+      case Type::OUTER:
+         return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
+      default:
+         return [{{- ["this", "ghost_layers", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "ghost_layers", "gpuStream"] | type_identifier_list -}}); };
       }
    }
 
    std::function<void (IBlock *)> {{kernel['function_name']}}({{- ["Type type", "gpuStream_t gpuStream"] | type_identifier_list -}})
    {
+      if (!validInnerOuterSplit_ && type != Type::ALL)
+         WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller, increase cellsPerBlock or avoid communication hiding")
+
       switch (type)
       {
-         case Type::INNER:
-            return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
-         case Type::OUTER:
-            return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
-         default:
-            return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | type_identifier_list -}}); };
+      case Type::INNER:
+         return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Inner({{- ["block", "gpuStream"] | type_identifier_list -}}); };
+      case Type::OUTER:
+         return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}Outer({{- ["block", "gpuStream"] | type_identifier_list -}}); };
+      default:
+         return [{{- ["this", "gpuStream"] | type_identifier_list -}}](IBlock* block) { {{kernel['function_name']}}({{- ["block", "cell_idx_c(0)", "gpuStream"] | type_identifier_list -}}); };
       }
    }
    {%- endif %}
@@ -247,24 +259,24 @@ public:
          layers_.push_back(ci);
       }
 
-    {%if target is equalto 'gpu'%}
+      {%if target is equalto 'gpu'%}
       {
          auto parallelSection_ = parallelStreams_.parallelSection( gpuStream );
          for( auto & ci: layers_ )
          {
-          parallelSection_.run([&]( auto s ) {
-             {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
-          });
+            parallelSection_.run([&]( auto s ) {
+               {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
+            });
          }
       }
-    {% else %}
+      {% else %}
       for( auto & ci: layers_ )
       {
          {{kernel['function_name']}}CellInterval({{kernel['kernel']|generate_function_collection_call(cell_interval='ci')}});
       }
-    {% endif %}
+      {% endif %}
 
-    {{kernel['kernel']|generate_swaps|indent(9)}}
+      {{kernel['kernel']|generate_swaps|indent(9)}}
    }
    {% endfor %}
 
@@ -275,17 +287,18 @@ public:
    }
    {%endif%}
 
-   private:
-      shared_ptr< StructuredBlockStorage > blocks_;
-      {{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}}
+ private:
+   shared_ptr< StructuredBlockStorage > blocks_;
+   {{kernel_list|generate_members(parameter_registration=parameter_scaling)|indent(4)}}
 
-      Cell outerWidth_;
-      std::vector<CellInterval> layers_;
+   Cell outerWidth_;
+   std::vector<CellInterval> layers_;
+   bool validInnerOuterSplit_;
 
-      {%if target is equalto 'gpu' -%}
-      gpu::ParallelStreams parallelStreams_;
-      // std::map<BlockID, gpuStream_t > streams_;
-      {%- endif %}
+   {%if target is equalto 'gpu' -%}
+   gpu::ParallelStreams parallelStreams_;
+   // std::map<BlockID, gpuStream_t > streams_;
+   {%- endif %}
 };
 
 
-- 
GitLab