diff --git a/CMakeLists.txt b/CMakeLists.txt
index 372ac25587c2322c183d06d266664a3053203ac4..cd001a9376f2917b2b31e386ca94b23098f320df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1011,11 +1011,18 @@ endif()
 option ( WALBERLA_THREAD_SAFE_LOGGING "Enables/Disables thread-safe logging" ON )
 
 if ( WALBERLA_BUILD_WITH_OPENMP )
+    if( APPLE AND EXISTS /opt/local/lib/libomp AND EXISTS /opt/local/include/libomp ) # find libomp from MacPorts
+        list( APPEND CMAKE_FRAMEWORK_PATH /opt/local/lib/libomp ) # append: keep search paths supplied by the user
+        list( APPEND CMAKE_INCLUDE_PATH /opt/local/include/libomp )
+    endif()
     find_package( OpenMP )
     if (OpenMP_FOUND)
         add_flag ( CMAKE_C_FLAGS   "${OpenMP_C_FLAGS}" )
         add_flag ( CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}" )
         list ( APPEND SERVICE_LIBS ${OpenMP_CXX_LIBRARIES} )
+        if( OpenMP_CXX_INCLUDE_DIRS ) # extra header dirs reported by FindOpenMP (e.g. a non-default libomp install)
+            include_directories( ${OpenMP_CXX_INCLUDE_DIRS} )
+        endif()
     else()
         #workarounds
         if ( WALBERLA_CXX_COMPILER_IS_NEC )
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
index 571f3129ef00a3b4a2a258059b25263170cc73fb..f616ba407e10e644878df134f0f295aa75d8fcf6 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
@@ -1,9 +1,9 @@
 from pystencils.field import fields
 
-from lbmpy.advanced_streaming.utility import get_timesteps, Timestep
+from lbmpy.advanced_streaming.utility import get_timesteps
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 from lbmpy.stencils import get_stencil
-from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
 
 from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
index bbdf8c144bd730dc0ec68c650ae56220f32cd4cf..f9a20e01a0ddcd11a65eade491cf05a35f25cca4 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -86,7 +86,6 @@ using FlagField_T      = FlagField< flag_t >;
 #if defined(WALBERLA_BUILD_WITH_CUDA)
 typedef cuda::GPUField< real_t > GPUField;
 #endif
-// using CommScheme_T = cuda::communication::UniformGPUScheme<stencil::D2Q9>;
 
 int main(int argc, char** argv)
 {
@@ -185,7 +184,7 @@ int main(int argc, char** argv)
       auto Comm_velocity_based_distributions =
          make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
       auto generatedPackInfo_velocity_based_distributions =
-         make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
+         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
@@ -193,7 +192,7 @@ int main(int argc, char** argv)
       auto Comm_phase_field_distributions =
          make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
       auto generatedPackInfo_phase_field_distributions =
-         make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
+         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
 #else
 
@@ -202,14 +201,14 @@ int main(int argc, char** argv)
 
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
       auto generatedPackInfo_velocity_based_distributions =
-         make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field);
+         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
 
       Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
       Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
 
       blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
       auto generatedPackInfo_phase_field_distributions =
-         make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field);
+         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
       Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
 #endif
 
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
index a751f35a43bab045b19e3ec94253f6930aacb839..bbd994de58a0b180293b7e643a2a27e0b6c50068 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
@@ -5,11 +5,12 @@ from pystencils import AssignmentCollection
 from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
 from lbmpy.stencils import get_stencil
 
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
+from lbmpy_walberla import generate_lb_pack_info
 
 from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
     initializer_kernel_hydro_lb, interface_tracking_force, \
-    hydrodynamic_force, get_collision_assignments_hydro
+    hydrodynamic_force, get_collision_assignments_hydro, get_collision_assignments_phase
 
 from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
 
@@ -52,6 +53,7 @@ w_c = 1.0 / (0.5 + (3.0 * M))
 u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
 # phase-field
 C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
+C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
 
 # phase-field distribution functions
 h = fields(f"lb_phase_field({q_phase}): [{dimensions}D]", layout='fzyx')
@@ -88,32 +90,26 @@ h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W)
 g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
 
 
-force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W)]
+force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
 force_model_h = MultiphaseForceModel(force=force_h)
 
-force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force)
+force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
+                             fd_stencil=get_stencil("D3Q27"))
 
-h_tmp_symbol_list = [h_tmp.center(i) for i, _ in enumerate(stencil_phase)]
-sum_h = np.sum(h_tmp_symbol_list[:])
+force_model_g = MultiphaseForceModel(force=force_g, rho=density)
 
 ####################
 # LBM UPDATE RULES #
 ####################
 
-method_phase.set_force_model(force_model_h)
+phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
+                                                      velocity_input=u,
+                                                      output={'density': C_tmp},  # C_tmp is swapped with C (field_swaps)
+                                                      force_model=force_model_h,
+                                                      symbolic_fields={"symbolic_field": h,
+                                                                       "symbolic_temporary_field": h_tmp},
+                                                      kernel_type='stream_pull_collide')
 
-phase_field_LB_step = create_lb_update_rule(lb_method=method_phase,
-                                            velocity_input=u,
-                                            compressible=True,
-                                            optimization={"symbolic_field": h,
-                                                          "symbolic_temporary_field": h_tmp},
-                                            kernel_type='stream_pull_collide')
-
-
-phase_field_LB_step.set_main_assignments_from_dict({**phase_field_LB_step.main_assignments_dict, **{C.center: sum_h}})
-
-phase_field_LB_step = AssignmentCollection(main_assignments=phase_field_LB_step.main_assignments,
-                                           subexpressions=phase_field_LB_step.subexpressions)
 phase_field_LB_step = sympy_cse(phase_field_LB_step)
 
 # ---------------------------------------------------------------------------------------------------------
@@ -121,18 +117,12 @@ phase_field_LB_step = sympy_cse(phase_field_LB_step)
 hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
                                                 density=density,
                                                 velocity_input=u,
-                                                force=force_g,
-                                                sub_iterations=1,
+                                                force_model=force_model_g,
+                                                sub_iterations=2,
                                                 symbolic_fields={"symbolic_field": g,
                                                                  "symbolic_temporary_field": g_tmp},
                                                 kernel_type='collide_stream_push')
 
-# streaming of the hydrodynamic distribution
-stream_hydro = create_lb_update_rule(stencil=stencil_hydro,
-                                     optimization={"symbolic_field": g,
-                                                   "symbolic_temporary_field": g_tmp},
-                                     kernel_type='stream_pull_only')
-
 ###################
 # GENERATE SWEEPS #
 ###################
@@ -161,7 +151,7 @@ with CodeGeneration() as ctx:
         generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates)
 
         generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp)],
+                       field_swaps=[(h, h_tmp), (C, C_tmp)],
                        inner_outer_split=True,
                        cpu_vectorize_info=cpu_vec)
 
@@ -171,12 +161,13 @@ with CodeGeneration() as ctx:
                        cpu_vectorize_info=cpu_vec)
 
         # communication
-        generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
-                                       phase_field_LB_step.main_assignments, target='cpu')
-        generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
-                                       hydro_LB_step.all_assignments, target='cpu', kind='pull')
-        generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
-                                       hydro_LB_step.all_assignments, target='cpu', kind='push')
+        generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                              streaming_pattern='pull', target='cpu')
+
+        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                              streaming_pattern='push', target='cpu')
+
+        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='cpu')
 
         ctx.write_file("GenDefines.h", info_header)
 
@@ -187,7 +178,7 @@ with CodeGeneration() as ctx:
                        g_updates, target='gpu')
 
         generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                       field_swaps=[(h, h_tmp)],
+                       field_swaps=[(h, h_tmp), (C, C_tmp)],
                        inner_outer_split=True,
                        target='gpu',
                        gpu_indexing_params=sweep_params,
@@ -200,12 +191,13 @@ with CodeGeneration() as ctx:
                        gpu_indexing_params=sweep_params,
                        varying_parameters=vp)
         # communication
-        generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
-                                       phase_field_LB_step.main_assignments, target='gpu')
-        generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
-                                       hydro_LB_step.all_assignments, target='gpu', kind='pull')
-        generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
-                                       hydro_LB_step.all_assignments, target='gpu', kind='push')
+        generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                              streaming_pattern='pull', target='gpu')
+
+        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                              streaming_pattern='push', target='gpu')
+
+        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='gpu')
 
         ctx.write_file("GenDefines.h", info_header)
 
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 29755644d22f0542ac29b50e848f20dd6a2c9974..b3ef93bb589a84f807ac00464fa84cd141c1eb0c 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
 
 
-foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4
-      entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt
-      cumulant cumulant_d3q27
-      srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt)
-
-    waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
-          FILE UniformGridGPU.py
-          CODEGEN_CFG ${config}
-          OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
-          UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
-          UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
-          UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
-          UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
-          UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
-          UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
-          UniformGridGPU_Defines.h
-          )
-
-
-    waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config}
-          FILES UniformGridGPU.cpp
-          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config})
-    set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
-endforeach ()
-
-
-foreach (config srt trt mrt smagorinsky entropic)
-
-    waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config}
-          FILE UniformGridGPU_AA.py
-          CODEGEN_CFG ${config}
-          OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
-          UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
-          UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
-          UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
-          UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
-          UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
-          UniformGridGPU_AA_Defines.h
-          )
-
-
-    waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
-          FILES UniformGridGPU_AA.cpp
-          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
-    set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
-endforeach ()
+# Generate sources and add one benchmark executable per (streaming pattern, stencil, collision setup) combination.
+foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
+    foreach(stencil d3q27) # choose from {d3q19 d3q27}
+        foreach(collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
+            set(config ${stencil}_${streaming_pattern}_${collision_setup})
+            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
+                    FILE UniformGridGPU.py
+                    CODEGEN_CFG ${config}
+                    OUT_FILES UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
+                    UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
+                    UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
+                    UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
+                    UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
+                    UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
+                    UniformGridGPU_InfoHeader.h
+                    )
+
+            waLBerla_add_executable(NAME UniformGridGPU_${config}
+                    FILES UniformGridGPU.cpp
+                    DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
+            set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
+        endforeach()
+    endforeach()
+endforeach()
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index a123cbf34d4470fc4daa8943956faa7c206e9ad6..bdcffe3ad3a8c1992d7746f4e40fd34fc9325c4a 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -1,361 +1,302 @@
+#include "blockforest/Initialization.h"
+
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
-#include "core/math/Random.h"
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-#include "python_coupling/DictWrapper.h"
-#include "blockforest/Initialization.h"
-#include "field/FlagField.h"
-#include "field/AddToStorage.h"
-#include "field/vtk/VTKWriter.h"
-#include "field/communication/PackInfo.h"
-#include "lbm/PerformanceLogger.h"
-#include "blockforest/communication/UniformBufferedScheme.h"
-#include "timeloop/all.h"
-#include "core/math/Random.h"
-#include "geometry/all.h"
-#include "cuda/HostFieldAllocator.h"
-#include "cuda/communication/GPUPackInfo.h"
-#include "cuda/ParallelStreams.h"
-#include "cuda/NVTX.h"
-#include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
 #include "cuda/AddGPUFieldToStorage.h"
-#include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/DeviceSelectMPI.h"
-#include "domain_decomposition/SharedSweep.h"
-#include "gui/Gui.h"
-#include "lbm/gui/Connection.h"
-
-#include "UniformGridGPU_LatticeModel.h"
-#include "UniformGridGPU_LbKernel.h"
-#include "UniformGridGPU_PackInfo.h"
-#include "UniformGridGPU_UBB.h"
-#include "UniformGridGPU_NoSlip.h"
-#include "UniformGridGPU_Communication.h"
-#include "UniformGridGPU_MacroSetter.h"
-#include "UniformGridGPU_MacroGetter.h"
-#include "UniformGridGPU_Defines.h"
+#include "cuda/ParallelStreams.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
 
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "field/vtk/VTKWriter.h"
 
-using namespace walberla;
+#include "geometry/InitBoundaryHandling.h"
 
-using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
+#include "lbm/inplace_streaming/TimestepTracker.h"
 
-const auto Q = LatticeModel_T::Stencil::Q;
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
+#include "python_coupling/PythonCallback.h"
 
+#include "timeloop/SweepTimeloop.h"
 
-using Stencil_T = LatticeModel_T::Stencil;
-using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
-using PdfField_T = GhostLayerField<real_t, Q>;
-using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
-using VelocityField_T = GhostLayerField<real_t, 3>;
-using flag_t = walberla::uint8_t;
-using FlagField_T = FlagField<flag_t>;
+#include "InitShearVelocity.h"
 
+#include <cmath>
 
-void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
-                       const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) )
-{
-    math::seedRandomGenerator(0);
-    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
-    for( auto & block: *blocks)
-    {
-        auto velField = block.getData<VelocityField_T>( velFieldID );
-        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
-            Cell globalCell;
-            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
-            velField->get(x, y, z, 1) = real_t(0);
-            velField->get(x, y, z, 2) = randomReal;
-
-            if( globalCell[2] >= halfZ ) {
-                velField->get(x, y, z, 0) = xMagnitude;
-            } else {
-                velField->get(x, y, z, 0) = -xMagnitude;
-            }
-        );
-    }
-}
+#include "UniformGridGPU_InfoHeader.h"
+using namespace walberla;
 
+using FlagField_T            = FlagField<uint8_t>;
 
-int main( int argc, char **argv )
+int main(int argc, char** argv)
 {
-   mpi::Environment env( argc, argv );
+   mpi::Environment env(argc, argv);
    cuda::selectDeviceBasedOnMpiRank();
 
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
-      WALBERLA_MPI_WORLD_BARRIER();
+      WALBERLA_MPI_WORLD_BARRIER()
+
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                        SETUP AND CONFIGURATION                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
       auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+      logging::configureLogging(config);
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
 
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+      Vector3< uint_t > cellsPerBlock =
+         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
       // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      auto parameters        = config->getOneBlock("Parameters");
+      const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
       const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
 
       // Creating fields
-      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
-      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
+      BlockDataID pdfFieldCpuID =
+         field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx);
+      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
+
+      // Initialize velocity on cpu
+      if( initShearFlow ){
+         WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
+         initShearVelocity(blocks, velFieldCpuID);
+      }
+
+      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true);
+      // Velocity field is copied to the GPU
+      BlockDataID velFieldGpuID =
+         cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
 
-      if( timeStepStrategy != "kernelOnlyNoInit")
+      pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID);
+
+      // Set up initial PDF values
+      for (auto& block : *blocks)
+         setterSweep(&block);
+
+      Vector3< int > innerOuterSplit =
+         parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
+
+      for (uint_t i = 0; i < 3; ++i)
       {
-          if ( initShearFlow )
-          {
-              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
-              initShearVelocity( blocks, velFieldCpuID );
-          }
-
-          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
-          for( auto & block : *blocks )
-              setterSweep( &block );
-
-          // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
-          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
-          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
-          initialComm();
+         if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
+         {
+            WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
+         }
       }
 
-      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
-      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
+      Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
+      bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
+      Vector3< int32_t > gpuBlockSize =
+         parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
 
+      int streamHighPriority = 0;
+      int streamLowPriority  = 0;
+      WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      using LbSweep      = lbm::UniformGridGPU_LbKernel;
+      using PackInfoEven = lbm::UniformGridGPU_PackInfoEven;
+      using PackInfoOdd  = lbm::UniformGridGPU_PackInfoOdd;
+      using cuda::communication::UniformGPUScheme;
+
+      LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
+      lbSweep.setOuterPriority(streamHighPriority);
 
       // Boundaries
       const FlagUID fluidFlagUID( "Fluid" );
+      BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
       auto boundariesConfig = config->getBlock( "Boundaries" );
-      bool disableBoundaries = true;
+      bool boundaries = false;
       if( boundariesConfig )
       {
-          disableBoundaries = false;
-          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
-          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+         boundaries = true;
+         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
       }
 
-      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
+      noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
 
-      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
-      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
-
-       // Communication setup
-      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
-      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
-      CommunicationSchemeType communicationScheme;
-      if( communicationSchemeStr == "GPUPackInfo_Baseline")
-          communicationScheme = GPUPackInfo_Baseline;
-      else if (communicationSchemeStr == "GPUPackInfo_Streams")
-          communicationScheme = GPUPackInfo_Streams;
-      else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
-          communicationScheme = UniformGPUScheme_Baseline;
-      else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
-          communicationScheme = UniformGPUScheme_Memcpy;
-      else if (communicationSchemeStr == "MPIDatatypes")
-          communicationScheme = MPIDatatypes;
-      else if (communicationSchemeStr == "MPIDatatypesFull")
-          communicationScheme = MPIDatatypesFull;
-      else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
-      }
+      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
+      ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
 
-      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
-      for(uint_t i=0; i< 3; ++i)
-      {
-          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
-              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
-          }
-      }
+      // Initial setup is the post-collision state of an even time step
+      auto tracker = make_shared< lbm::TimestepTracker >(0);
 
-      int streamHighPriority = 0;
-      int streamLowPriority = 0;
-      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
-      WALBERLA_CHECK(gpuBlockSize[2] == 1);
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
-                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
-                                                    gpuBlockSize[0], gpuBlockSize[1],
-                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
-      lbKernel.setOuterPriority( streamHighPriority );
-      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
-         gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
-
-      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
-      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
-      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
-      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
-
-      uint_t currentTimeStep = 0;
-
-      auto simpleOverlapTimeStep = [&] ()
-      {
-          gpuComm.startCommunication(defaultStream);
-          for( auto &block: *blocks )
-              lbKernel.inner( &block, defaultStream );
-          gpuComm.wait(defaultStream);
-          for( auto &block: *blocks )
-              lbKernel.outer( &block, defaultStream );
-      };
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                           COMMUNICATION SCHEME                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-      auto overlapTimeStep = [&]()
-      {
-         cuda::NvtxRange namedRange("timestep");
-         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
+      UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
+      auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
+      comm.addPackInfo(packInfo);
 
-         innerOuterSection.run([&]( auto innerStream )
-         {
-            cuda::nameStream(innerStream, "inner stream");
-            for( auto &block: *blocks )
-            {
-               if(!disableBoundaries)
-               {
-                  auto p = boundaryInnerStreams.parallelSection( innerStream );
-                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
-                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
-               }
-               lbKernel.inner( &block, innerStream );
-            }
-         });
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                          TIME STEP DEFINITIONS                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-         innerOuterSection.run([&]( auto outerStream )
-         {
-            cuda::nameStream(outerStream, "outer stream");
-            gpuComm( outerStream );
+      auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
 
-            for( auto &block: *blocks )
-            {
-               if(!disableBoundaries)
-               {
-                  auto p = boundaryOuterStreams.parallelSection( outerStream );
-                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
-                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
-               }
-               lbKernel.outer( &block, outerStream );
-            }
-         });
-         currentTimeStep += 1;
+      auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.run(block, t, stream);
+         ubb.run(block, t, stream);
       };
 
+      auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.inner(block, t, stream);
+         ubb.inner(block, t, stream);
+      };
 
-      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
-      auto normalTimeStep = [&]()
-      {
-         gpuComm();
-         for( auto &block: *blocks )
-         {
-            if(!disableBoundaries)
-            {
-               auto p = boundaryStreams.parallelSection( defaultStream );
-               p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
-               p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
-            }
-            lbKernel( &block );
+      auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.outer(block, t, stream);
+         ubb.outer(block, t, stream);
+      };
+
+      auto simpleOverlapTimeStep = [&]() {
+         // Communicate post-collision values of previous timestep...
+         comm.startCommunication(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
+            lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
          }
+         comm.wait(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
+            lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
+         }
+
+         tracker->advance();
       };
 
-      auto kernelOnlyFunc = [&] ()
-      {
-          for( auto &block: *blocks )
-              lbKernel( &block );
+      auto normalTimeStep = [&]() {
+         comm.communicate(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
+            lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
+         }
+
+         tracker->advance();
       };
 
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+      // With two-fields patterns, ghost layer cells act as constant stream-in boundaries;
+      // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
+      auto kernelOnlyFunc = [&]() {
+         tracker->advance();
+         for (auto& block : *blocks)
+            lbSweep(&block, tracker->getCounter(), defaultStream);
+      };
 
-      std::function<void()> timeStep;
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                             TIME LOOP SETUP                                                ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
+      // Default to "noOverlap": the fallback must be one of the strategies accepted by the dispatch below
+      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "noOverlap");
+      std::function< void() > timeStep;
       if (timeStepStrategy == "noOverlap")
-          timeStep = std::function<void()>( normalTimeStep );
-      else if (timeStepStrategy == "complexOverlap")
-          timeStep = std::function<void()>( overlapTimeStep );
+         timeStep = std::function< void() >(normalTimeStep);
       else if (timeStepStrategy == "simpleOverlap")
-          timeStep = simpleOverlapTimeStep;
-      else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
-          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
-          timeStep = kernelOnlyFunc;
+         timeStep = simpleOverlapTimeStep;
+      else if (timeStepStrategy == "kernelOnly")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT(
+            "Running only compute kernel without boundary - this makes only sense for benchmarking!")
+         // Run initial communication once to provide any missing stream-in populations
+         comm.communicate();
+         timeStep = kernelOnlyFunc;
       }
-      else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
+      else
+      {
+         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
+                                      "'simpleOverlap', 'kernelOnly'")
       }
 
-      timeLoop.add() << BeforeFunction( timeStep  )
-                     << Sweep( []( IBlock * ) {}, "time step" );
-
-      pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
+      timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
 
       // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
+      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      if (vtkWriteFrequency > 0)
       {
-         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                          "simulation_step", false, true, true, false, 0 );
-         auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
+         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                         "simulation_step", false, true, true, false, 0);
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel");
          vtkOutput->addCellDataWriter(velWriter);
-         vtkOutput->addBeforeFunction( [&]() {
-             cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-             for( auto & block : *blocks )
-                 getterSweep( &block );
+
+         vtkOutput->addBeforeFunction([&]() {
+           cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
          });
-         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
 
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                               BENCHMARK                                                    ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
+      int warmupSteps     = parameters.getParameter< int >("warmupSteps", 2);
+      int outerIterations = parameters.getParameter< int >("outerIterations", 1);
+      for (int i = 0; i < warmupSteps; ++i)
          timeLoop.singleStep();
 
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
-          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
+      double remainingTimeLoggerFrequency =
+         parameters.getParameter< double >("remainingTimeLoggerFrequency", -1.0); // in seconds
+      if (remainingTimeLoggerFrequency > 0)
       {
-          GUI gui( timeLoop, blocks, argc, argv);
-          lbm::connectToGui<LatticeModel_T>(gui);
-          gui.run();
+         auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
+                                                   remainingTimeLoggerFrequency);
+         timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
       }
-      else
+
+      for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
       {
-          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-          {
-              timeLoop.setCurrentTimeStepToZero();
-              WcTimer simTimer;
-              cudaDeviceSynchronize();
-              WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-              simTimer.start();
-              timeLoop.run();
-              cudaDeviceSynchronize();
-              simTimer.end();
-              WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-              auto time = simTimer.last();
-              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-              WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-              WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-              WALBERLA_ROOT_SECTION()
-              {
-                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                  if ( pythonCallbackResults.isCallable())
-                  {
-                      const char * storagePattern = "twofield";
-                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                      pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
-                      pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
-                      pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
-                      // Call Python function to report results
-                      pythonCallbackResults();
-                  }
-              }
-          }
+         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
+         timeLoop.setCurrentTimeStepToZero();
+         WcTimer simTimer;
+         WALBERLA_CUDA_CHECK(cudaDeviceSynchronize()) // drain pending GPU work before the timer starts
+         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+         WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+         simTimer.start();
+         timeLoop.run();
+         WALBERLA_CUDA_CHECK(cudaDeviceSynchronize()) // fail here instead of reporting timings of a failed run
+         simTimer.end();
+         WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+         auto time      = simTimer.last();
+         auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+         auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
+         WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
+         WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
+         WALBERLA_ROOT_SECTION()
+         {
+            python_coupling::PythonCallback pythonCallbackResults("results_callback");
+            if (pythonCallbackResults.isCallable())
+            {
+               pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+               // Call Python function to report results
+               pythonCallbackResults();
+            }
+         }
       }
    }
 
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 6f9670d1d6b3609b21be57fc7d026389229fb320..7d3c8e7f21e602128564f4ea8e22a119e7080989 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -1,19 +1,22 @@
 import sympy as sp
 import numpy as np
 import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
-from lbmpy.boundaries import NoSlip, UBB
-from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
-from pystencils_walberla import generate_pack_info_from_kernel
-from lbmpy_walberla import generate_lattice_model, generate_boundary
-from pystencils_walberla import CodeGeneration, generate_sweep
+
 from pystencils.data_types import TypedSymbol
 from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
-from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
+
+from lbmpy.advanced_streaming import Timestep, is_inplace
+from lbmpy.advanced_streaming.utility import streaming_patterns
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.stencils import get_stencil
+
+from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
+from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
 
 omega = sp.symbols("omega")
 omega_free = sp.Symbol("omega_free")
-omega_fill = sp.symbols("omega_:10")
 compile_time_block_size = False
 
 if compile_time_block_size:
@@ -21,156 +24,158 @@ if compile_time_block_size:
 else:
     sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
                         TypedSymbol("cudaBlockSize1", np.int32),
-                        1)
+                        TypedSymbol("cudaBlockSize2", np.int32))
 
-sweep_params = {'block_size': sweep_block_size}
+gpu_indexing_params = {'block_size': sweep_block_size}
 
 options_dict = {
     'srt': {
         'method': 'srt',
-        'stencil': 'D3Q19',
         'relaxation_rate': omega,
         'compressible': False,
     },
     'trt': {
         'method': 'trt',
-        'stencil': 'D3Q19',
         'relaxation_rate': omega,
     },
     'mrt': {
         'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
+        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
     },
-    'mrt_full': {
+    'mrt-overrelax': {
         'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
-                             omega_fill[3], omega_fill[4], omega_fill[5]],
+        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
     },
-    'entropic': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
+    'cumulant': {
+        'method': 'cumulant',
+        'relaxation_rate': omega,
         'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
-        'entropic': True,
     },
-    'entropic_kbc_n4': {
-        'method': 'trt-kbc-n4',
-        'stencil': 'D3Q27',
+    'cumulant-overrelax': {
+        'method': 'cumulant',
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
         'compressible': True,
-        'relaxation_rates': [omega, omega_free],
+    },
+    'entropic': {
+        'method': 'mrt',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
         'entropic': True,
     },
     'smagorinsky': {
         'method': 'srt',
-        'stencil': 'D3Q19',
         'smagorinsky': True,
         'relaxation_rate': omega,
-    },
-    'cumulant': {
-        'method': 'cumulant',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rate': omega,
-    },
+    }
 }
 
 info_header = """
-#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
 const char * infoStencil = "{stencil}";
-const char * infoConfigName = "{configName}";
+const char * infoStreamingPattern = "{streaming_pattern}";
+const char * infoCollisionSetup = "{collision_setup}";
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
 
+# DEFAULTS
+optimize = True
 
 with CodeGeneration() as ctx:
-    accessor = StreamPullTwoFieldsAccessor()
-    # accessor = StreamPushTwoFieldsAccessor()
-    assert not accessor.is_inplace, "This app does not work for inplace accessors"
+    # Config name layout: <stencil>_<streaming_pattern>_<collision_setup>[_noopt]
+    config_tokens = ctx.config.split('_')
+    assert len(config_tokens) >= 3, \
+        f"Invalid config '{ctx.config}': expected <stencil>_<streaming_pattern>_<collision_setup>[_noopt]"
+    stencil_str, streaming_pattern, collision_setup = config_tokens[:3]
+
+    if len(config_tokens) >= 4:
+        optimize = (config_tokens[3] != 'noopt')
+
+    stencil = get_stencil(stencil_str)
+    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
+    assert collision_setup in options_dict, f"Invalid collision setup: {collision_setup}"
+
+    options = options_dict[collision_setup]
+
+    q = len(stencil)
+    dim = len(stencil[0])
+    assert dim == 3, "This app supports only three-dimensional stencils"
+    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
 
     common_options = {
-        'field_name': 'pdfs',
-        'temporary_field_name': 'pdfs_tmp',
-        'kernel_type': accessor,
-        'optimization': {'cse_global': True,
-                         'cse_pdfs': False}
+        'stencil': stencil,
+        'field_name': pdfs.name,
+        'optimization': {
+            'target': 'gpu',
+            'cse_global': True,
+            'cse_pdfs': False,
+            'symbolic_field': pdfs,
+            'field_layout': 'fzyx',
+            'gpu_indexing_params': gpu_indexing_params,
+        }
     }
-    config_name = ctx.config
-    noopt = False
-    d3q27 = False
-    if config_name.endswith("_noopt"):
-        noopt = True
-        config_name = config_name[:-len("_noopt")]
-    if config_name.endswith("_d3q27"):
-        d3q27 = True
-        config_name = config_name[:-len("_d3q27")]
-
-    options = options_dict[config_name]
-    options.update(common_options)
-    options = options.copy()
 
-    if noopt:
-        options['optimization']['cse_global'] = False
-        options['optimization']['cse_pdfs'] = False
-    if d3q27:
-        options['stencil'] = 'D3Q27'
+    options.update(common_options)
 
-    stencil_str = options['stencil']
-    q = int(stencil_str[stencil_str.find('Q') + 1:])
-    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
-    options['optimization']['symbolic_field'] = pdfs
+    if is_inplace(streaming_pattern):
+        field_swaps = []  # in-place pattern: no temporary field is registered and nothing is swapped
+    else:
+        options['optimization']['symbolic_temporary_field'] = pdfs_tmp
+        field_swaps = [(pdfs, pdfs_tmp)]
 
     vp = [
-        ('double', 'omega_0'),
-        ('double', 'omega_1'),
-        ('double', 'omega_2'),
-        ('double', 'omega_3'),
-        ('double', 'omega_4'),
-        ('double', 'omega_5'),
-        ('double', 'omega_6'),
         ('int32_t', 'cudaBlockSize0'),
         ('int32_t', 'cudaBlockSize1'),
+        ('int32_t', 'cudaBlockSize2')
     ]
-    lb_method = create_lb_method(**options)
-    update_rule = create_lb_update_rule(lb_method=lb_method, **options)
- 
-    if not noopt:
-        update_rule = insert_fast_divisions(update_rule)
-        update_rule = insert_fast_sqrts(update_rule)
-
-    # CPU lattice model - required for macroscopic value computation, VTK output etc.
-    options_without_opt = options.copy()
-    del options_without_opt['optimization']
-    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method,
-                                                                                        **options_without_opt))
-
-    # gpu LB sweep & boundaries
-    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
-                   field_swaps=[('pdfs', 'pdfs_tmp')],
-                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
-                   varying_parameters=vp)
-
-    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
-    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
+
+    # LB Sweep
+    collision_rule = create_lb_collision_rule(**options)
+
+    if optimize:
+        collision_rule = insert_fast_divisions(collision_rule)
+        collision_rule = insert_fast_sqrts(collision_rule)
+
+    lb_method = collision_rule.method
+
+    generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, streaming_pattern,
+                                   optimization=options['optimization'],
+                                   inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps)
 
     # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=1.0)
-    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
-    generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
+    setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs,
+                                                   streaming_pattern=streaming_pattern,
+                                                   previous_timestep=Timestep.EVEN)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
+
+    # Boundaries
+    noslip = NoSlip()
+    ubb = UBB((0.05, 0, 0))
+
+    generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name,
+                                      streaming_pattern=streaming_pattern, target='gpu')
+    generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name,
+                                      streaming_pattern=streaming_pattern, target='gpu')
 
     # communication
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
+    generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs,
+                          streaming_pattern=streaming_pattern, target='gpu',
+                          always_generate_separate_classes=True)
 
     infoHeaderParams = {
         'stencil': stencil_str,
-        'q': q,
-        'configName': ctx.config,
+        'streaming_pattern': streaming_pattern,
+        'collision_setup': collision_setup,
         'cse_global': int(options['optimization']['cse_global']),
         'cse_pdfs': int(options['optimization']['cse_pdfs']),
     }
-    ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
+
+    stencil_typedefs = {'Stencil_T': stencil,
+                        'CommunicationStencil_T': stencil}
+    field_typedefs = {'PdfField_T': pdfs,
+                      'VelocityField_T': velocity_field}
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'UniformGridGPU_InfoHeader',
+                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
+                         additional_code=info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
deleted file mode 100644
index dbda68b72a5f2990965e2ff2c1e6e759c6fe0d2e..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-#include "core/Environment.h"
-#include "core/logging/Initialization.h"
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-#include "python_coupling/DictWrapper.h"
-#include "blockforest/Initialization.h"
-#include "field/FlagField.h"
-#include "field/AddToStorage.h"
-#include "field/vtk/VTKWriter.h"
-#include "field/communication/PackInfo.h"
-#include "lbm/PerformanceLogger.h"
-#include "blockforest/communication/UniformBufferedScheme.h"
-#include "timeloop/all.h"
-#include "geometry/all.h"
-#include "cuda/HostFieldAllocator.h"
-#include "cuda/communication/GPUPackInfo.h"
-#include "cuda/ParallelStreams.h"
-#include "core/timing/TimingPool.h"
-#include "core/timing/RemainingTimeLogger.h"
-#include "cuda/AddGPUFieldToStorage.h"
-#include "cuda/communication/UniformGPUScheme.h"
-#include "cuda/DeviceSelectMPI.h"
-#include "domain_decomposition/SharedSweep.h"
-#include "InitShearVelocity.h"
-#include "gui/Gui.h"
-
-#ifdef WALBERLA_ENABLE_GUI
-#include "lbm/gui/PdfFieldDisplayAdaptor.h"
-#endif
-
-
-#include "UniformGridGPU_AA_PackInfoPush.h"
-#include "UniformGridGPU_AA_PackInfoPull.h"
-#include "UniformGridGPU_AA_MacroSetter.h"
-#include "UniformGridGPU_AA_MacroGetter.h"
-#include "UniformGridGPU_AA_LbKernelEven.h"
-#include "UniformGridGPU_AA_LbKernelOdd.h"
-#include "UniformGridGPU_AA_Defines.h"
-
-#include <cmath>
-
-using namespace walberla;
-
-using CommunicationStencil_T = Stencil_T;
-using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
-using VelocityField_T = GhostLayerField< real_t, 3 >;
-
-
-int main( int argc, char **argv )
-{
-    mpi::Environment env( argc, argv );
-    cuda::selectDeviceBasedOnMpiRank();
-
-    for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-    {
-        WALBERLA_MPI_WORLD_BARRIER();
-
-        WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-
-        auto config = *cfg;
-        logging::configureLogging( config );
-        auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-
-        Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
-        // Reading parameters
-        auto parameters = config->getOneBlock( "Parameters" );
-        const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
-        const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
-
-        // Creating fields
-        BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
-        BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-
-        WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
-        initShearVelocity( blocks, velFieldCpuID );
-
-        pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
-        pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
-
-        for ( auto &block : *blocks )
-            setterSweep( &block );
-
-        BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
-
-        Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
-
-        for(uint_t i=0; i< 3; ++i)
-        {
-            if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
-                WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
-            }
-        }
-
-        Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
-        bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-        Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
-
-        int streamHighPriority = 0;
-        int streamLowPriority = 0;
-        WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
-        WALBERLA_CHECK( gpuBlockSize[2] == 1 );
-
-
-        using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
-        using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
-        using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
-        using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
-        using cuda::communication::UniformGPUScheme;
-
-        KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
-        KernelOdd  kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
-
-        kernelEven.setOuterPriority( streamHighPriority );
-        kernelOdd .setOuterPriority( streamHighPriority );
-
-        auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
-        auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
-        pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
-        pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
-
-
-        auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
-
-        auto setupPhase = [&]() {
-            for ( auto &block: *blocks )
-                kernelEven( &block );
-
-            pullScheme->communicate();
-
-            for ( auto &block: *blocks )
-                kernelOdd( &block );
-        };
-
-
-        auto tearDownPhase = [&]() {
-            pushScheme->communicate();
-            cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-            for ( auto &block : *blocks )
-                getterSweep( &block );
-        };
-
-
-        auto simpleOverlapTimeStep = [&]()
-        {
-            // Even
-            pushScheme->startCommunication( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven.inner( &block, defaultStream );
-            pushScheme->wait( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven.outer( &block, defaultStream );
-
-            // Odd
-            pullScheme->startCommunication( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd.inner( &block, defaultStream );
-            pullScheme->wait( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd.outer( &block, defaultStream );
-        };
-
-        auto normalTimeStep = [&]()
-        {
-            pushScheme->communicate( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven( &block, defaultStream );
-
-            pullScheme->communicate( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd( &block, defaultStream );
-        };
-
-        auto kernelOnlyFunc = [&]()
-        {
-            for ( auto &block: *blocks )
-                kernelEven( &block, defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd( &block, defaultStream );
-        };
-
-        SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-
-        const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
-        std::function< void() > timeStep;
-        if ( timeStepStrategy == "noOverlap" )
-            timeStep = std::function< void() >( normalTimeStep );
-        else if ( timeStepStrategy == "simpleOverlap" )
-            timeStep = simpleOverlapTimeStep;
-        else if ( timeStepStrategy == "kernelOnly" )
-        {
-            WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
-            timeStep = kernelOnlyFunc;
-        }
-        else
-        {
-            WALBERLA_ABORT_NO_DEBUG_INFO(
-                    "Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
-        }
-
-        timeLoop.add() << BeforeFunction( timeStep )
-                       << Sweep( []( IBlock * ) {}, "time step" );
-
-
-        // VTK
-        uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
-        if ( vtkWriteFrequency > 0 )
-        {
-            auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                             "simulation_step", false, true, true, false, 0 );
-            auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
-            vtkOutput->addCellDataWriter( velWriter );
-            vtkOutput->addBeforeFunction( [&]()
-                                          {
-                                              tearDownPhase();
-                                              setupPhase();
-                                          } );
-            timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-        }
-
-        int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
-        int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
-        setupPhase();
-        for ( int i = 0; i < warmupSteps; ++i )
-            timeLoop.singleStep();
-
-        double  remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-        if ( remainingTimeLoggerFrequency > 0 )
-        {
-            auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), remainingTimeLoggerFrequency );
-            timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-        }
-
-        bool useGui = parameters.getParameter<bool>( "useGui", false );
-        if( useGui )
-        {
-#ifdef WALBERLA_ENABLE_GUI
-            cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-            timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
-            GUI gui( timeLoop, blocks, argc, argv);
-                gui.registerDisplayAdaptorCreator(
-                [&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
-                    if ( block.isDataOfType< PdfField_T >( blockDataID) )
-                        return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
-                    return nullptr;
-                });
-            gui.run();
-#else
-            WALBERLA_ABORT_NO_DEBUG_INFO("Application was built without GUI. Set useGui to false or re-compile with GUI.")
-#endif
-        }
-        else
-        {
-            for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-            {
-                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-
-                timeLoop.setCurrentTimeStepToZero();
-                WcTimer simTimer;
-                cudaDeviceSynchronize();
-                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-                WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-                simTimer.start();
-                timeLoop.run();
-                cudaDeviceSynchronize();
-                simTimer.end();
-                WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-                auto time = simTimer.last();
-                auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-
-                auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-                WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-                WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-                WALBERLA_ROOT_SECTION()
-                {
-                    python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                    if ( pythonCallbackResults.isCallable())
-                    {
-                        const char * storagePattern = "aa";
-                        pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                        pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                        pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                        pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
-                        pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
-                        pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
-                        // Call Python function to report results
-                        pythonCallbackResults();
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
deleted file mode 100644
index c7e6341ae6d9a125e09427dec23f044e212709f8..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import sympy as sp
-import numpy as np
-import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
-from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
-from pystencils_walberla import generate_pack_info_from_kernel
-from pystencils_walberla import CodeGeneration, generate_sweep
-from pystencils.data_types import TypedSymbol
-from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
-from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
-
-omega = sp.symbols("omega")
-omega_free = sp.Symbol("omega_free")
-compile_time_block_size = False
-
-if compile_time_block_size:
-    sweep_block_size = (128, 1, 1)
-else:
-    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                        TypedSymbol("cudaBlockSize1", np.int32),
-                        1)
-
-sweep_params = {'block_size': sweep_block_size}
-
-options_dict = {
-    'srt': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'relaxation_rate': omega,
-        'compressible': False,
-    },
-    'trt': {
-        'method': 'trt',
-        'stencil': 'D3Q19',
-        'relaxation_rate': omega,
-    },
-    'mrt': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
-    },
-    'entropic': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
-        'entropic': True,
-    },
-    'smagorinsky': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'smagorinsky': True,
-        'relaxation_rate': omega,
-    }
-}
-
-
-info_header = """
-#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
-const char * infoStencil = "{stencil}";
-const char * infoConfigName = "{configName}";
-const bool infoCseGlobal = {cse_global};
-const bool infoCsePdfs = {cse_pdfs};
-"""
-
-
-with CodeGeneration() as ctx:
-    accessors = {
-        'Even': AAEvenTimeStepAccessor(),
-        'Odd': AAOddTimeStepAccessor()
-    }
-
-    common_options = {
-        'field_name': 'pdfs',
-        'optimization': {'cse_global': True,
-                         'cse_pdfs': False,
-                         'field_layout': 'fzyx',
-                         }
-    }
-    options = options_dict.get(ctx.config, options_dict['srt'])
-    options.update(common_options)
-
-    stencil_str = options['stencil']
-    q = int(stencil_str[stencil_str.find('Q') + 1:])
-    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
-    options['optimization']['symbolic_field'] = pdfs
-
-    vp = [
-        ('int32_t', 'cudaBlockSize0'),
-        ('int32_t', 'cudaBlockSize1')
-    ]
-    lb_method = create_lb_method(**options)
-
-    # Kernels
-    options_without_opt = options.copy()
-    del options_without_opt['optimization']
-    update_rules = {}
-    for name, accessor in accessors.items():
-        update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options)
-        update_rule = insert_fast_divisions(update_rule)
-        update_rule = insert_fast_sqrts(update_rule)
-        update_rules[name] = update_rule
-        generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule,
-                       inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
-                       varying_parameters=vp)
-
-    # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=1.0)
-    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments)
-
-    # communication
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'],
-                                   kind='pull', target='gpu')
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'],
-                                   kind='push', target='gpu')
-
-    infoHeaderParams = {
-        'stencil': stencil_str,
-        'q': q,
-        'configName': ctx.config,
-        'cse_global': int(options['optimization']['cse_global']),
-        'cse_pdfs': int(options['optimization']['cse_pdfs']),
-    }
-    ctx.write_file("UniformGridGPU_AA_Defines.h", info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48fca7135ba30b8f546b421c1329e847e0b6d129
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp
@@ -0,0 +1,328 @@
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Random.h"
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+#include "python_coupling/DictWrapper.h"
+#include "blockforest/Initialization.h"
+#include "field/FlagField.h"
+#include "field/AddToStorage.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/communication/PackInfo.h"
+#include "lbm/PerformanceLogger.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "timeloop/all.h"
+#include "core/math/Random.h"
+#include "geometry/all.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/ParallelStreams.h"
+#include "cuda/NVTX.h"
+#include "core/timing/TimingPool.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "cuda/DeviceSelectMPI.h"
+#include "domain_decomposition/SharedSweep.h"
+
+#include "UniformGridGPU_LatticeModel.h"
+#include "UniformGridGPU_LbKernel.h"
+#include "UniformGridGPU_PackInfo.h"
+#include "UniformGridGPU_UBB.h"
+#include "UniformGridGPU_NoSlip.h"
+#include "UniformGridGPU_Communication.h"
+#include "UniformGridGPU_MacroSetter.h"
+#include "UniformGridGPU_MacroGetter.h"
+#include "UniformGridGPU_Defines.h"
+
+#include "InitShearVelocity.h"
+
+
+using namespace walberla;
+
+using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
+
+const auto Q = LatticeModel_T::Stencil::Q; // used as the second template argument of PdfField_T
+
+// Stencil, field and communication aliases derived from the lattice model
+using Stencil_T = LatticeModel_T::Stencil;
+using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
+using PdfField_T = GhostLayerField<real_t, Q>;
+using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; // not referenced in main() below
+using VelocityField_T = GhostLayerField<real_t, 3>;
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+// Benchmark driver (two-field storage pattern). For every configuration delivered by the Python
+// coupling it builds the block grid, the CPU/GPU PDF fields, the boundaries and the GPU
+// communication, runs the selected time step strategy and reports MLUPS to "results_callback".
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+   cuda::selectDeviceBasedOnMpiRank();
+
+   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      WALBERLA_MPI_WORLD_BARRIER();
+
+      auto config = *cfg;
+      logging::configureLogging( config );
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      // the default has to be one of the strategy names dispatched on further below
+      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "noOverlap");
+      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
+      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
+
+      // Creating fields
+      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
+      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
+
+      if( timeStepStrategy != "kernelOnlyNoInit") // "kernelOnlyNoInit" skips the velocity/PDF initialization
+      {
+          if ( initShearFlow )
+          {
+              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
+              initShearVelocity( blocks, velFieldCpuID );
+          }
+
+          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
+          for( auto & block : *blocks )
+              setterSweep( &block );
+
+          // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
+          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
+          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
+          initialComm();
+      }
+
+      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
+      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
+
+      // Boundaries
+      const FlagUID fluidFlagUID( "Fluid" );
+      auto boundariesConfig = config->getBlock( "Boundaries" );
+      bool disableBoundaries = true; // stays true when the config has no "Boundaries" block
+      if( boundariesConfig )
+      {
+          disableBoundaries = false;
+          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+      }
+
+      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
+      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
+
+      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
+      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
+
+      // Communication setup
+      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
+      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
+      CommunicationSchemeType communicationScheme = UniformGPUScheme_Baseline; // same as the default string above
+      if( communicationSchemeStr == "GPUPackInfo_Baseline")
+          communicationScheme = GPUPackInfo_Baseline;
+      else if (communicationSchemeStr == "GPUPackInfo_Streams")
+          communicationScheme = GPUPackInfo_Streams;
+      else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
+          communicationScheme = UniformGPUScheme_Baseline;
+      else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
+          communicationScheme = UniformGPUScheme_Memcpy;
+      else if (communicationSchemeStr == "MPIDatatypes")
+          communicationScheme = MPIDatatypes;
+      else if (communicationSchemeStr == "MPIDatatypesFull")
+          communicationScheme = MPIDatatypesFull;
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
+      }
+
+      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
+      for(uint_t i=0; i< 3; ++i)
+      {
+          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
+              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
+          }
+      }
+
+      int streamHighPriority = 0;
+      int streamLowPriority = 0;
+      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
+      WALBERLA_CHECK(gpuBlockSize[2] == 1); // only gpuBlockSize[0] and [1] are handed to the kernel below
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
+                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, // NOTE(review): presumably the remaining relaxation rates - confirm against the generated kernel
+                                                    gpuBlockSize[0], gpuBlockSize[1],
+                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
+      lbKernel.setOuterPriority( streamHighPriority );
+      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< real_t > >
+         gpuComm( blocks, pdfFieldGpuID, communicationScheme, cudaEnabledMPI );
+
+      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
+      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
+
+      uint_t currentTimeStep = 0; // only incremented by overlapTimeStep; not read anywhere in main
+
+      auto simpleOverlapTimeStep = [&] ()
+      {
+          gpuComm.startCommunication(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.inner( &block, defaultStream );
+          gpuComm.wait(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.outer( &block, defaultStream );
+      };
+
+      auto overlapTimeStep = [&]()
+      {
+         cuda::NvtxRange namedRange("timestep");
+         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
+
+         innerOuterSection.run([&]( auto innerStream )
+         {
+            cuda::nameStream(innerStream, "inner stream");
+            for( auto &block: *blocks )
+            {
+               if(!disableBoundaries)
+               {
+                  auto p = boundaryInnerStreams.parallelSection( innerStream );
+                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
+                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
+               }
+               lbKernel.inner( &block, innerStream );
+            }
+         });
+
+         innerOuterSection.run([&]( auto outerStream )
+         {
+            cuda::nameStream(outerStream, "outer stream");
+            gpuComm( outerStream );
+
+            for( auto &block: *blocks )
+            {
+               if(!disableBoundaries)
+               {
+                  auto p = boundaryOuterStreams.parallelSection( outerStream );
+                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
+                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
+               }
+               lbKernel.outer( &block, outerStream );
+            }
+         });
+         currentTimeStep += 1;
+      };
+
+      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
+      auto normalTimeStep = [&]()
+      {
+         gpuComm();
+         for( auto &block: *blocks )
+         {
+            if(!disableBoundaries)
+            {
+               auto p = boundaryStreams.parallelSection( defaultStream );
+               p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
+               p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
+            }
+            lbKernel( &block );
+         }
+      };
+
+      auto kernelOnlyFunc = [&] ()
+      {
+          for( auto &block: *blocks )
+              lbKernel( &block );
+      };
+
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+
+      std::function<void()> timeStep;
+      if (timeStepStrategy == "noOverlap")
+          timeStep = std::function<void()>( normalTimeStep );
+      else if (timeStepStrategy == "complexOverlap")
+          timeStep = std::function<void()>( overlapTimeStep );
+      else if (timeStepStrategy == "simpleOverlap")
+          timeStep = simpleOverlapTimeStep;
+      else if (timeStepStrategy == "kernelOnly" || timeStepStrategy == "kernelOnlyNoInit") {
+          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
+          timeStep = kernelOnlyFunc;
+      }
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly', 'kernelOnlyNoInit'");
+      }
+
+      timeLoop.add() << BeforeFunction( timeStep  )
+                     << Sweep( []( IBlock * ) {}, "time step" ); // empty sweep: all work happens in timeStep
+
+      pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
+
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
+      if( vtkWriteFrequency > 0 )
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                          "simulation_step", false, true, true, false, 0 );
+         auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
+         vtkOutput->addCellDataWriter(velWriter);
+         vtkOutput->addBeforeFunction( [&]() {
+             cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
+             for( auto & block : *blocks )
+                 getterSweep( &block );
+         });
+         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+
+
+      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
+      for(int i=0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if (remainingTimeLoggerFrequency > 0) {
+          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
+          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+
+       for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+       {
+           timeLoop.setCurrentTimeStepToZero();
+           WcTimer simTimer;
+           WALBERLA_CUDA_CHECK( cudaDeviceSynchronize() ); // drain pending GPU work before the timer starts
+           WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+           simTimer.start();
+           timeLoop.run();
+           WALBERLA_CUDA_CHECK( cudaDeviceSynchronize() ); // wait for the GPU and surface errors before stopping the timer
+           simTimer.end();
+           WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+           auto time = simTimer.last();
+           auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+           auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+           WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+           WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+           WALBERLA_ROOT_SECTION()
+           {
+               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+               if ( pythonCallbackResults.isCallable())
+               {
+                   const char * storagePattern = "twofield";
+                   pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                   pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                   pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                   pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
+                   pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
+                   pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
+                   // Call Python function to report results
+                   pythonCallbackResults();
+               }
+           }
+       }
+   }
+
+   return 0;
+}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm
similarity index 100%
rename from apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..c861cb15578a7af152e382c0b7d4a607c52181f5
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py
@@ -0,0 +1,175 @@
+import sympy as sp
+import numpy as np
+import pystencils as ps
+from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
+from pystencils_walberla import generate_pack_info_from_kernel
+from lbmpy_walberla import generate_lattice_model, generate_boundary
+from pystencils_walberla import CodeGeneration, generate_sweep
+from pystencils.data_types import TypedSymbol
+from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
+from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
+
+omega = sp.symbols("omega")
+omega_free = sp.Symbol("omega_free")
+omega_fill = sp.symbols("omega_:10")
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
+
+sweep_params = {'block_size': sweep_block_size}
+
+options_dict = {
+    'srt': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+        'compressible': False,
+    },
+    'trt': {
+        'method': 'trt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+    },
+    'mrt': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
+    },
+    'mrt_full': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
+                             omega_fill[3], omega_fill[4], omega_fill[5]],
+    },
+    'entropic': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
+        'entropic': True,
+    },
+    'entropic_kbc_n4': {
+        'method': 'trt-kbc-n4',
+        'stencil': 'D3Q27',
+        'compressible': True,
+        'relaxation_rates': [omega, omega_free],
+        'entropic': True,
+    },
+    'smagorinsky': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'smagorinsky': True,
+        'relaxation_rate': omega,
+    },
+    'cumulant': {
+        'method': 'cumulant',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rate': omega,
+    },
+}
+
+info_header = """
+#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
+const char * infoStencil = "{stencil}";
+const char * infoConfigName = "{configName}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+with CodeGeneration() as ctx:
+    accessor = StreamPullTwoFieldsAccessor()
+    # accessor = StreamPushTwoFieldsAccessor()
+    assert not accessor.is_inplace, "This app does not work for inplace accessors"
+
+    common_options = {
+        'field_name': 'pdfs',
+        'temporary_field_name': 'pdfs_tmp',
+        'kernel_type': accessor,
+        'optimization': {'cse_global': True,
+                         'cse_pdfs': False}
+    }
+    config_name = ctx.config
+    noopt = False
+    d3q27 = False
+    if config_name.endswith("_noopt"):
+        noopt = True
+        config_name = config_name[:-len("_noopt")]
+    if config_name.endswith("_d3q27"):
+        d3q27 = True
+        config_name = config_name[:-len("_d3q27")]
+
+    # assemble a private dict so the options_dict entry and common_options are left unmodified
+    options = dict(options_dict[config_name], **common_options)
+    options['optimization'] = dict(options['optimization'])
+
+    if noopt:
+        options['optimization']['cse_global'] = False
+        options['optimization']['cse_pdfs'] = False
+    if d3q27:
+        options['stencil'] = 'D3Q27'
+
+    stencil_str = options['stencil']
+    q = int(stencil_str[stencil_str.find('Q') + 1:])
+    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
+    options['optimization']['symbolic_field'] = pdfs
+
+    vp = [
+        ('double', 'omega_0'),
+        ('double', 'omega_1'),
+        ('double', 'omega_2'),
+        ('double', 'omega_3'),
+        ('double', 'omega_4'),
+        ('double', 'omega_5'),
+        ('double', 'omega_6'),
+        ('int32_t', 'cudaBlockSize0'),
+        ('int32_t', 'cudaBlockSize1'),
+    ]
+    lb_method = create_lb_method(**options)
+    update_rule = create_lb_update_rule(lb_method=lb_method, **options)
+
+    if not noopt:
+        update_rule = insert_fast_divisions(update_rule)
+        update_rule = insert_fast_sqrts(update_rule)
+
+    # CPU lattice model - required for macroscopic value computation, VTK output etc.
+    options_without_opt = options.copy()
+    del options_without_opt['optimization']
+    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method,
+                                                                                        **options_without_opt))
+
+    # gpu LB sweep & boundaries
+    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
+                   field_swaps=[('pdfs', 'pdfs_tmp')],
+                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
+                   varying_parameters=vp)
+
+    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
+    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
+
+    # getter & setter
+    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=1.0)
+    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=None)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
+    generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
+
+    # communication
+    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
+
+    infoHeaderParams = {
+        'stencil': stencil_str,
+        'q': q,
+        'configName': ctx.config,
+        'cse_global': int(options['optimization']['cse_global']),
+        'cse_pdfs': int(options['optimization']['cse_pdfs']),
+    }
+    ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h
similarity index 100%
rename from apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
new file mode 100755
index 0000000000000000000000000000000000000000..d13e155f7dd5903465d7a9274bf7ae8200148582
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
+Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
+ ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py
+
+Look at the end of the file to select the benchmark to run
+"""
+
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+from copy import deepcopy
+from functools import reduce
+import operator
+import sys
+import sqlite3
+
+# Number of time steps run for a workload of 128^3 per GPU
+# if twice as many cells are on the GPU, half as many time steps are run, etc.
+# increase this to get more reliable measurements
+TIME_STEPS_FOR_128_BLOCK = 200
+DB_FILE = "gpu_benchmark.sqlite3"
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+
+
+def prod(seq):  # product of all items in seq (1 for an empty sequence)
+    return reduce(operator.mul, seq, 1)
+
+
+def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
+    """Scales the step count inversely with the number of cells, relative to a 128^3 block."""
+    cells = block_size[0] * block_size[1] * block_size[2]
+    return int((128 ** 3 / cells) * time_steps_for_128_block)
+
+
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Tells whether a CUDA block of the given shape stays below the SM register limit.
+    The default of 168 registers per thread was read off cuobjdump output for the SRT and
+    Cumulant kernels; other kernels may need a different value."""
+    register_limit = 64 * (2 ** 10)
+    return register_limit > prod(block_size) * regs_per_threads
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """True if one block, padded by gls layers on every side with q values per cell, needs less than total_mem"""
+    return total_mem > prod(extent + 2 * gls for extent in block_size) * q * size_per_value
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), db_file=DB_FILE, **kwargs):
+        self.db_file = db_file
+        self.config_dict = deepcopy(BASE_CONFIG)
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+
+    @wlb.member_callback
+    def config(self):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        # Write out the configuration as text-based prm:
+        # print(toPrm(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, "runs", self.db_file)
+                storeSingle(result, "runs", self.db_file)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+
+
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+
+
+def overlap_benchmark():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running different communication overlap strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
+
+    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
+                               innerOuterSplit=(1, 1, 1)))
+
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split,
+                                    timesteps=num_time_steps((256, 128, 128)))
+                scenarios.add(scenario)
+
+
+def communication_compare():
+    """Tests different communication strategies"""
+    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+
+            sc = Scenario(cells_per_block=block_size,
+                          gpuBlockSize=(128, 1, 1),
+                          timeStepStrategy='noOverlap',
+                          communicationScheme=comm_strategy,
+                          timesteps=num_time_steps(block_size))
+            scenarios.add(sc)
+            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
+                # ensure that the inner part of the domain is still large enough
+                if 3 * inner_outer_split[0] > block_size[0]:
+                    continue
+                sc = Scenario(cells_per_block=block_size,
+                              gpuBlockSize=(128, 1, 1),
+                              timeStepStrategy='simpleOverlap',
+                              innerOuterSplit=inner_outer_split,
+                              communicationScheme=comm_strategy,
+                              timesteps=num_time_steps(block_size))
+                scenarios.add(sc)
+
+
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+
+    kwargs = {}
+    if gpu_type is not None:
+        kwargs['gpu_type'] = gpu_type
+
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=num_time_steps(block_size),
+                                **kwargs)
+            scenarios.add(scenario)
+
+
+# -------------------------------------- Optional job script generation for PizDaint ---------------------------------
+
+# sbatch rejects scripts whose first line is not the shebang, so the string must start with it
+job_script_header = """\
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --time=0:30:00
+#SBATCH --nodes={nodes}
+#SBATCH -o out_scaling_{nodes}_%j.txt
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+
+cd {folder}
+
+source ~/env.sh
+
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1            # allow GPU sharing
+export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+
+
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+
+ulimit -c 0
+"""
+
+job_script_exe_part = """
+
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+ ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',  # default apps launched by generate_jobscripts()
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant',
+                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+
+
+def generate_jobscripts(exe_names=all_executables):
+    """Writes scaling_<nodes>/job.sh for a fixed list of node counts, one run section per entry of exe_names."""
+    import subprocess
+
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+
+        # run grid_order on the block decomposition computed for this node count
+        grid = ",".join(str(e) for e in block_decomposition(node_count))
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', grid])
+
+        script_parts = [job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))]
+        script_parts += [job_script_exe_part.format(app="../" + exe, nodes=node_count,
+                                                    config='../communication_compare.py') for exe in exe_names]
+
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write("".join(script_parts))
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    # Select the benchmark you want to run
+    single_gpu_benchmark()
+    # benchmarks different CUDA block sizes and domain sizes and measures single
+    # GPU performance of compute kernel (no communication)
+    # communication_compare(): benchmarks different communication routines, with and without overlap
+    # overlap_benchmark(): benchmarks different communication overlap options
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
similarity index 89%
rename from apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
rename to apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
index ef257af8bc013da08ad5d04765e786c44a756a5e..1b22fcaaf2fe65fff1f32f162c959bd012995b97 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
@@ -8,6 +8,7 @@ DomainSetup
 
 Parameters
 {
+    initShearFlow False;
     cudaEnabledMPI False;           // set to true, if MPI was compiled with CUDA
     outerIterations 3;              // number of measurements to make
     timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap, the non-AA version also supports complexOverlap
@@ -35,3 +36,10 @@ Parameters
     //   GPUPackInfo_Baseline:       old implementation based on communication mechanism for CPUs
     //   GPUPackInfo_Streams:        same as above but with CUDA streams
 }
+
+Boundaries {
+    Border { direction T; walldistance -1; flag UBB; }
+    Border { direction B; walldistance -1; flag NoSlip; }
+    Border { direction W; walldistance -1; flag NoSlip; }
+    Border { direction E; walldistance -1; flag NoSlip; }
+}
diff --git a/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods
new file mode 100644
index 0000000000000000000000000000000000000000..99ce7d0a33ff710ea40be7735533adcbb06b91d1
Binary files /dev/null and b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods differ
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index 5fbf36a9488874e87318ca402c747378f5a15f46..b3e7ac03396100609e509149f35ab7179712f945 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -2,7 +2,7 @@
 """
 This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
 Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
- ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py
+ ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py
 
 Look at the end of the file to select the benchmark to run
 """
@@ -11,14 +11,15 @@ import os
 import waLBerla as wlb
 from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
-from copy import deepcopy
+from functools import reduce
+import operator
 import sys
 import sqlite3
 
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
-TIME_STEPS_FOR_128_BLOCK = 200
+TIME_STEPS_FOR_128_BLOCK = 500
 DB_FILE = "gpu_benchmark.sqlite3"
 
 BASE_CONFIG = {
@@ -35,26 +36,82 @@ BASE_CONFIG = {
 }
 
 
-def num_time_steps(block_size):
+def prod(seq):  # product of all items in seq (1 for an empty sequence)
+    return reduce(operator.mul, seq, 1)
+
+
+def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
     cells = block_size[0] * block_size[1] * block_size[2]
-    time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block  # fewer steps for larger blocks
     return int(time_steps)
 
 
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Tells whether a CUDA block of the given shape stays below the SM register limit.
+    The default of 168 registers per thread was read off cuobjdump output for the SRT and
+    Cumulant kernels; other kernels may need a different value."""
+    register_limit = 64 * (2 ** 10)
+    return register_limit > prod(block_size) * regs_per_threads
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """True if one block, padded by gls layers on every side with q values per cell, needs less than total_mem"""
+    return total_mem > prod(extent + 2 * gls for extent in block_size) * q * size_per_value
+
+
 class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
-        self.config_dict = deepcopy(BASE_CONFIG)
-        self.config_dict['Parameters'].update(kwargs)
-        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
-        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1),
+                 timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
+                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False,
+                 additional_info=None):
+
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+
+        self.cells_per_block = cells_per_block
+        self.periodic = periodic
+
+        self.time_step_strategy = time_step_strategy
+        self.omega = omega
+        self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.inner_outer_split = inner_outer_split
+        self.init_shear_flow = init_shear_flow
+        self.warmup_steps = warmup_steps
+        self.outer_iterations = outer_iterations
+        self.cuda_blocks = cuda_blocks
+
+        self.vtk_write_frequency = 0
+
+        self.config_dict = self.config(print_dict=False)
+        self.additional_info = additional_info
 
     @wlb.member_callback
-    def config(self, **kwargs):
+    def config(self, print_dict=True):
         from pprint import pformat
-        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
-        # Write out the configuration as text-based prm:
-        # print(toPrm(self.config_dict))
-        return self.config_dict
+        config_dict = {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'omega': self.omega,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'warmupSteps': self.warmup_steps,
+                'outerIterations': self.outer_iterations,
+                'timeStepStrategy': self.time_step_strategy,
+                'timesteps': self.timesteps,
+                'initShearFlow': self.init_shear_flow,
+                'gpuBlockSize': self.cuda_blocks,
+                'innerOuterSplit': self.inner_outer_split,
+                'vtkWriteFrequency': self.vtk_write_frequency
+            }
+        }
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+            if self.additional_info:
+                wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info))
+        return config_dict
 
     @wlb.member_callback
     def results_callback(self, **kwargs):
@@ -62,6 +119,10 @@ class Scenario:
         data.update(self.config_dict['Parameters'])
         data.update(self.config_dict['DomainSetup'])
         data.update(kwargs)
+
+        if self.additional_info is not None:
+            data.update(self.additional_info)
+
         data['executable'] = sys.argv[0]
         data['compile_flags'] = wlb.build_info.compiler_flags
         data['walberla_version'] = wlb.build_info.version
@@ -78,7 +139,22 @@ class Scenario:
                 storeSingle(result, "runs", DB_FILE)
                 break
             except sqlite3.OperationalError as e:
-                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+
+
+# -------------------------------------- Profiling -----------------------------------
+def profiling():
+    """Registers one short kernel-only scenario (128^3 cells, 2 timesteps, no warmup) for profiling"""
+    wlb.log_info_on_root("Running 2 timesteps for profiling")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    cells = (128, 128, 128)
+    cuda_enabled_mpi = False
+
+    scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
+                           inner_outer_split=(1, 1, 1), timesteps=2, cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1, warmup_steps=0))
 
 
 # -------------------------------------- Functions trying different parameter sets -----------------------------------
@@ -90,52 +166,21 @@ def overlap_benchmark():
     wlb.log_info_on_root("")
 
     scenarios = wlb.ScenarioManager()
+    cuda_enabled_mpi = False
     inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
                           (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                           (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
 
-    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
-    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
-        # no overlap
-        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
-                               innerOuterSplit=(1, 1, 1)))
-
-        # overlap
-        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
-            for inner_outer_split in inner_outer_splits:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    communicationScheme=comm_strategy,
-                                    innerOuterSplit=inner_outer_split,
-                                    timesteps=num_time_steps((256, 128, 128)))
-                scenarios.add(scenario)
-
-
-def communication_compare():
-    """Tests different communication strategies"""
-    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
-    wlb.log_info_on_root("")
+    # no overlap
+    scenarios.add(Scenario(time_step_strategy='noOverlap',
+                           inner_outer_split=(1, 1, 1),
+                           cuda_enabled_mpi=cuda_enabled_mpi))
 
-    scenarios = wlb.ScenarioManager()
-    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
-        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
-
-            sc = Scenario(cells_per_block=block_size,
-                          gpuBlockSize=(128, 1, 1),
-                          timeStepStrategy='noOverlap',
-                          communicationScheme=comm_strategy,
-                          timesteps=num_time_steps(block_size))
-            scenarios.add(sc)
-            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
-                # ensure that the inner part of the domain is still large enough
-                if 3 * inner_outer_split[0] > block_size[0]:
-                    continue
-                sc = Scenario(cells_per_block=block_size,
-                              gpuBlockSize=(128, 1, 1),
-                              timeStepStrategy='simpleOverlap',
-                              innerOuterSplit=inner_outer_split,
-                              communicationScheme=comm_strategy,
-                              timesteps=num_time_steps(block_size))
-                scenarios.add(sc)
+    for inner_outer_split in inner_outer_splits:
+        scenario = Scenario(time_step_strategy='simpleOverlap',
+                            inner_outer_split=inner_outer_split,
+                            cuda_enabled_mpi=cuda_enabled_mpi)
+        scenarios.add(scenario)
 
 
 def single_gpu_benchmark():
@@ -143,8 +188,16 @@ def single_gpu_benchmark():
     wlb.log_info_on_root("Running single GPU benchmarks")
     wlb.log_info_on_root("")
 
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+
+    additional_info = {}
+    if gpu_type is not None:
+        additional_info['gpu_type'] = gpu_type
+
     scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)]
     cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
                    (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
                    (32, 4, 1), (64, 4, 1), (128, 4, 1),
@@ -152,10 +205,17 @@ def single_gpu_benchmark():
                    (32, 16, 1)]
     for block_size in block_sizes:
         for cuda_block_size in cuda_blocks:
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
             scenario = Scenario(cells_per_block=block_size,
-                                gpuBlockSize=cuda_block_size,
-                                timeStepStrategy='kernelOnly',
-                                timesteps=num_time_steps(block_size))
+                                cuda_blocks=cuda_block_size,
+                                time_step_strategy='kernelOnly',
+                                timesteps=num_time_steps(block_size),
+                                additional_info=additional_info)
             scenarios.add(scenario)
 
 
@@ -207,7 +267,6 @@ do
 done
 """
 
-
 all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
                    'UniformGridBenchmarkGPU_smagorinsky_d3q27',
                    'UniformGridBenchmarkGPU_cumulant'
@@ -238,10 +297,9 @@ if __name__ == '__main__':
     print("Called without waLBerla - generating job scripts for PizDaint")
     generate_jobscripts()
 else:
-    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
     # Select the benchmark you want to run
-    single_gpu_benchmark()
-    # benchmarks different CUDA block sizes and domain sizes and measures single
-    # GPU performance of compute kernel (no communication)
-    # communication_compare(): benchmarks different communication routines, with and without overlap
-    # overlap_benchmark(): benchmarks different communication overlap options
+    single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU
+    # performance of compute kernel (no communication)
+    # overlap_benchmark()  # benchmarks different communication overlap options
+    # profiling()  # run only two timesteps on a smaller domain for profiling only
diff --git a/apps/showcases/CMakeLists.txt b/apps/showcases/CMakeLists.txt
index 31561a65b4a930f0f6f2380cfe9c0529dbc76be6..1b23d49540e177cec571cf9898a7000c0b335227 100644
--- a/apps/showcases/CMakeLists.txt
+++ b/apps/showcases/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory( BidisperseFluidizedBed )
 add_subdirectory( CombinedResolvedUnresolved )
+add_subdirectory( FluidizedBed )
 add_subdirectory( HeatConduction )
 add_subdirectory( Mixer )
 add_subdirectory( PegIntoSphereBed )
diff --git a/apps/showcases/FluidizedBed/CMakeLists.txt b/apps/showcases/FluidizedBed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e19bf49814f2979efb21258c37dcf49294b14016
--- /dev/null
+++ b/apps/showcases/FluidizedBed/CMakeLists.txt
@@ -0,0 +1,4 @@
+waLBerla_link_files_to_builddir( "*.prm" ) # presumably makes the *.prm parameter files of this directory available in the build directory - confirm against the waLBerla CMake helpers
+# FluidizedBed showcase application: a single source file that depends on the waLBerla modules listed after DEPENDS
+waLBerla_add_executable ( NAME FluidizedBed FILES FluidizedBed.cpp
+                          DEPENDS blockforest boundary core domain_decomposition field lbm lbm_mesapd_coupling mesa_pd postprocessing timeloop vtk )
diff --git a/apps/showcases/FluidizedBed/FluidizedBed.cpp b/apps/showcases/FluidizedBed/FluidizedBed.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4795d48563cb283a5e2d2ad5ea89034e1a8be65b
--- /dev/null
+++ b/apps/showcases/FluidizedBed/FluidizedBed.cpp
@@ -0,0 +1,814 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FluidizedBed.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "boundary/all.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/debug/Debug.h"
+#include "core/math/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/logging/all.h"
+#include "core/mpi/Broadcast.h"
+#include "core/grid_generator/SCIterator.h"
+
+#include "domain_decomposition/SharedSweep.h"
+
+#include "field/AddToStorage.h"
+#include "field/communication/PackInfo.h"
+
+#include "lbm/boundary/all.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/lattice_model/D3Q19.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+
+#include "lbm_mesapd_coupling/mapping/ParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/Reconstructor.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/PdfReconstructionManager.h"
+#include "lbm_mesapd_coupling/utility/AddForceOnParticlesKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/DataTypes.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/shape/HalfSpace.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/domain/BlockForestDataHandling.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "vtk/all.h"
+#include "field/vtk/all.h"
+#include "lbm/vtk/all.h"
+
+namespace fluidized_bed
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using walberla::uint_t;
+
+using LatticeModel_T = lbm::D3Q19< lbm::collision_model::SRT>;
+
+using Stencil_T = LatticeModel_T::Stencil;
+using PdfField_T = lbm::PdfField<LatticeModel_T>;
+
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+using ScalarField_T = GhostLayerField< real_t, 1>;
+
+const uint_t FieldGhostLayers = 1;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag( "fluid" );
+const FlagUID NoSlip_Flag( "no slip" );
+const FlagUID MO_Flag( "moving obstacle" );
+const FlagUID FormerMO_Flag( "former moving obstacle" );
+const FlagUID Inflow_Flag("inflow");
+const FlagUID Outflow_Flag("outflow");
+
+/////////////////////////////////////
+// BOUNDARY HANDLING CUSTOMIZATION //
+/////////////////////////////////////
+template <typename ParticleAccessor_T>
+class MyBoundaryHandling // functor creating the boundary handling object of one block; main() passes an instance to addStructuredBlockData
+{
+public:
+
+   using NoSlip_T = lbm::NoSlip< LatticeModel_T, flag_t >; // used for the side walls
+   using MO_T = lbm_mesapd_coupling::CurvedLinear< LatticeModel_T, FlagField_T, ParticleAccessor_T >; // used for cells flagged as moving obstacle
+   using Inflow_T = lbm::SimpleUBB< LatticeModel_T, flag_t >; // used for the inflow at the bottom
+   using Outflow_T = lbm::SimplePressure< LatticeModel_T, flag_t >; // used for the outflow at the top
+   using Type = BoundaryHandling< FlagField_T, Stencil_T, NoSlip_T, MO_T, Inflow_T, Outflow_T >; // type of the object returned by operator()
+   // Stores the block data IDs, the particle accessor, the inflow velocity and the periodicity of the horizontal directions.
+   MyBoundaryHandling( const BlockDataID & flagFieldID, const BlockDataID & pdfFieldID,
+                       const BlockDataID & particleFieldID, const shared_ptr<ParticleAccessor_T>& ac,
+                       Vector3<real_t> inflowVelocity, bool periodicInX, bool periodicInY) :
+         flagFieldID_( flagFieldID ), pdfFieldID_( pdfFieldID ), particleFieldID_( particleFieldID ), ac_( ac ),
+         inflowVelocity_(inflowVelocity), periodicInX_(periodicInX), periodicInY_(periodicInY) {}
+   // Builds the boundary handling of the given block: inflow at the bottom, outflow at the top, no-slip walls in every non-periodic horizontal direction.
+   Type * operator()( IBlock* const block, const StructuredBlockStorage* const storage ) const
+   {
+      WALBERLA_ASSERT_NOT_NULLPTR( block );
+      WALBERLA_ASSERT_NOT_NULLPTR( storage );
+
+      auto * flagField     = block->getData< FlagField_T >( flagFieldID_ );
+      auto *  pdfField     = block->getData< PdfField_T > ( pdfFieldID_ );
+      auto * particleField = block->getData< lbm_mesapd_coupling::ParticleField_T > ( particleFieldID_ );
+
+      const auto fluid = flagField->flagExists( Fluid_Flag ) ? flagField->getFlag( Fluid_Flag ) : flagField->registerFlag( Fluid_Flag ); // reuse the fluid flag if it exists, register it otherwise
+
+      Type * handling = new Type( "moving obstacle boundary handling", flagField, fluid,
+                                  NoSlip_T( "NoSlip", NoSlip_Flag, pdfField ),
+                                  MO_T( "MO", MO_Flag, pdfField, flagField, particleField, ac_, fluid, *storage, *block ),
+                                  Inflow_T( "Inflow", Inflow_Flag, pdfField, inflowVelocity_),
+                                  Outflow_T( "Outflow", Outflow_Flag, pdfField, real_t(1) ) ); // NOTE(review): real_t(1) is presumably the prescribed outflow density - confirm
+
+
+      const auto inflow  = flagField->getFlag( Inflow_Flag );
+      const auto outflow = flagField->getFlag( Outflow_Flag );
+      const auto noslip = flagField->getFlag( NoSlip_Flag );
+
+      CellInterval domainBB = storage->getDomainCellBB();
+      // enlarge the global cell bounding box by the ghost layers in all directions; the boundary slabs set below are its outermost layers
+      domainBB.xMin() -= cell_idx_c( FieldGhostLayers );
+      domainBB.xMax() += cell_idx_c( FieldGhostLayers );
+
+      domainBB.zMin() -= cell_idx_c( FieldGhostLayers );
+      domainBB.zMax() += cell_idx_c( FieldGhostLayers );
+
+      domainBB.yMin() -= cell_idx_c( FieldGhostLayers );
+      domainBB.yMax() += cell_idx_c( FieldGhostLayers );
+
+      // inflow at bottom: the single cell layer at zMin of the enlarged bounding box
+      CellInterval bottom( domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMin() );
+      storage->transformGlobalToBlockLocalCellInterval( bottom, *block );
+      handling->forceBoundary( inflow, bottom );
+
+      // outflow at top: the single cell layer at zMax of the enlarged bounding box
+      CellInterval top( domainBB.xMin(), domainBB.yMin(), domainBB.zMax(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax() );
+      storage->transformGlobalToBlockLocalCellInterval( top, *block );
+      handling->forceBoundary( outflow, top );
+
+      if (!periodicInX_) {
+         // left wall: cell layer at xMin
+         CellInterval left(domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMin(), domainBB.yMax(), domainBB.zMax());
+         storage->transformGlobalToBlockLocalCellInterval(left, *block);
+         handling->forceBoundary(noslip, left);
+
+         // right wall: cell layer at xMax
+         CellInterval right(domainBB.xMax(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax());
+         storage->transformGlobalToBlockLocalCellInterval(right, *block);
+         handling->forceBoundary(noslip, right);
+      }
+
+      if (!periodicInY_) {
+         // front wall: cell layer at yMin
+         CellInterval front(domainBB.xMin(), domainBB.yMin(), domainBB.zMin(), domainBB.xMax(), domainBB.yMin(), domainBB.zMax());
+         storage->transformGlobalToBlockLocalCellInterval(front, *block);
+         handling->forceBoundary(noslip, front);
+
+         // back wall: cell layer at yMax
+         CellInterval back(domainBB.xMin(), domainBB.yMax(), domainBB.zMin(), domainBB.xMax(), domainBB.yMax(), domainBB.zMax());
+         storage->transformGlobalToBlockLocalCellInterval(back, *block);
+         handling->forceBoundary(noslip, back);
+      }
+
+      handling->fillWithDomain( FieldGhostLayers ); // presumably marks all cells without a boundary flag as domain (fluid), including the ghost layers - confirm against BoundaryHandling::fillWithDomain
+
+      return handling; // raw pointer created with new above; presumably the block storage takes ownership - confirm
+   }
+
+private:
+
+   const BlockDataID flagFieldID_;
+   const BlockDataID pdfFieldID_;
+   const BlockDataID particleFieldID_;
+
+   shared_ptr<ParticleAccessor_T> ac_; // forwarded to the moving obstacle boundary condition
+
+   Vector3<real_t> inflowVelocity_; // forwarded to the inflow boundary condition
+   bool periodicInX_; // if true, no walls are set at the x boundaries
+   bool periodicInY_; // if true, no walls are set at the y boundaries
+};
+//*******************************************************************************************************************
+
+
+void createPlane(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss,
+                 Vector3<real_t> position, Vector3<real_t> normal) { // adds one fixed, infinite half-space particle with the given position and normal to the particle storage
+   mesa_pd::data::Particle&& p0 = *ps->create(true); // NOTE(review): the argument true presumably creates a global particle - confirm against ParticleStorage::create
+   p0.setPosition(position);
+   p0.setInteractionRadius(std::numeric_limits<real_t>::infinity());
+   p0.setShapeID(ss->create<mesa_pd::data::HalfSpace>( normal )); // a new HalfSpace shape is created for every plane
+   p0.setOwner(mpi::MPIManager::instance()->rank()); // every calling process registers itself as owner
+   p0.setType(0);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::INFINITE);
+   mesa_pd::data::particle_flags::set(p0.getFlagsRef(), mesa_pd::data::particle_flags::FIXED);
+}
+
+void createPlaneSetup(const shared_ptr<mesa_pd::data::ParticleStorage> & ps, const shared_ptr<mesa_pd::data::ShapeStorage> & ss,
+                      const math::AABB & simulationDomain, bool periodicInX, bool periodicInY, real_t offsetAtInflow, real_t offsetAtOutflow )
+{ // creates the planes bounding the particles: bottom and top always, side walls only in non-periodic directions; both offsets are distances measured into the domain
+   createPlane(ps, ss, simulationDomain.minCorner() + Vector3<real_t>(0,0,offsetAtInflow), Vector3<real_t>(0,0,1)); // bottom plane, shifted upwards by offsetAtInflow
+   createPlane(ps, ss, simulationDomain.maxCorner() - Vector3<real_t>(0,0,offsetAtOutflow), Vector3<real_t>(0,0,-1)); // top plane, shifted downwards by offsetAtOutflow (adding the offset would place it outside the domain, behind the outflow)
+   // side walls coincide with the domain faces and are skipped in periodic directions
+   if(!periodicInX)
+   {
+      createPlane(ps, ss, simulationDomain.minCorner(), Vector3<real_t>(1,0,0));
+      createPlane(ps, ss, simulationDomain.maxCorner(), Vector3<real_t>(-1,0,0));
+   }
+
+   if(!periodicInY)
+   {
+      createPlane(ps, ss, simulationDomain.minCorner(), Vector3<real_t>(0,1,0));
+      createPlane(ps, ss, simulationDomain.maxCorner(), Vector3<real_t>(0,-1,0));
+   }
+}
+
+struct ParticleInfo // particle statistics; members hold process-local sums/maxima until allReduce() is called
+{
+   real_t averageVelocity = 0_r; // sum of velocity magnitudes before allReduce(), mean value afterwards
+   real_t maximumVelocity = 0_r;
+   uint_t numParticles = 0;
+   real_t maximumHeight = 0_r;
+   real_t particleVolume = 0_r; // accumulated volume of all counted particles
+   real_t heightOfMass = 0_r; // sum of volume * height before allReduce(), volume-weighted mean height afterwards
+
+   void allReduce() // reduces every member with SUM or MAX and then normalizes the two averaged quantities
+   {
+      walberla::mpi::allReduceInplace(numParticles, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumVelocity, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(maximumHeight, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(particleVolume, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(heightOfMass, walberla::mpi::SUM);
+
+      if( numParticles > uint_t(0) ) averageVelocity /= real_c(numParticles); // without particles the sum stays 0 instead of becoming 0/0 = NaN
+      if( particleVolume > 0_r ) heightOfMass /= particleVolume; // same guard for the volume-weighted height
+   }
+};
+
+std::ostream & operator<<( std::ostream & os, const ParticleInfo & info ) { // prints all members of a ParticleInfo as one line of text
+   os << "Particle Info: uAvg = " << info.averageVelocity << ", uMax = " << info.maximumVelocity << ", numParticles = " << info.numParticles;
+   os << ", zMax = " << info.maximumHeight << ", Vp = " << info.particleVolume << ", zMass = " << info.heightOfMass;
+   return os;
+}
+
+
+template< typename Accessor_T >
+ParticleInfo evaluateParticleInfo( const Accessor_T & ac )
+{
+   // Accumulates velocity, height and volume statistics of all particles that are neither ghost nor global, then reduces them via ParticleInfo::allReduce().
+   static_assert( std::is_base_of< mesa_pd::data::IAccessor, Accessor_T >::value, "Provide a valid accessor" );
+
+   ParticleInfo result;
+   for( uint_t idx = 0; idx < ac.size(); ++idx )
+   {
+      // particles flagged as GHOST or GLOBAL do not enter the statistics
+      if( isSet( ac.getFlags( idx ), mesa_pd::data::particle_flags::GHOST ) || isSet( ac.getFlags( idx ), mesa_pd::data::particle_flags::GLOBAL ) ) continue;
+
+      ++result.numParticles;
+      const real_t speed  = ac.getLinearVelocity( idx ).length();
+      const real_t volume = ac.getShape( idx )->getVolume();
+      const real_t zPos   = ac.getPosition( idx )[2];
+      result.averageVelocity += speed;
+      result.maximumVelocity  = std::max( result.maximumVelocity, speed );
+      result.maximumHeight    = std::max( result.maximumHeight, zPos );
+      result.particleVolume  += volume;
+      result.heightOfMass    += volume * zPos;
+   }
+
+   result.allReduce();
+   return result;
+}
+
+struct FluidInfo // fluid statistics; members hold process-local sums/maxima until allReduce() is called
+{
+   uint_t numFluidCells = 0;
+   real_t averageVelocity = 0_r; // sum of velocity magnitudes before allReduce(), mean value afterwards
+   real_t maximumVelocity = 0_r;
+   real_t averageDensity = 0_r; // sum of densities before allReduce(), mean value afterwards
+   real_t maximumDensity = 0_r;
+
+
+   void allReduce() // reduces every member with SUM or MAX and then normalizes the two averaged quantities
+   {
+      walberla::mpi::allReduceInplace(numFluidCells, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(averageVelocity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumVelocity, walberla::mpi::MAX);
+      walberla::mpi::allReduceInplace(averageDensity, walberla::mpi::SUM);
+      walberla::mpi::allReduceInplace(maximumDensity, walberla::mpi::MAX);
+
+      if( numFluidCells > uint_t(0) ) averageVelocity /= real_c(numFluidCells); // without fluid cells the sum stays 0 instead of becoming 0/0 = NaN
+      if( numFluidCells > uint_t(0) ) averageDensity /= real_c(numFluidCells);
+   }
+};
+
+std::ostream & operator<<( std::ostream & os, const FluidInfo & info ) { // prints all members of a FluidInfo as one line of text
+   os << "Fluid Info: numFluidCells = " << info.numFluidCells << ", uAvg = " << info.averageVelocity << ", uMax = " << info.maximumVelocity;
+   os << ", densityAvg = " << info.averageDensity << ", densityMax = " << info.maximumDensity;
+   return os;
+}
+
+
+template <typename BoundaryHandling_T>
+FluidInfo evaluateFluidInfo( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & pdfFieldID, const BlockDataID & boundaryHandlingID )
+{ // gathers cell count, velocity and density statistics over all cells marked as domain by the boundary handling, then reduces them via FluidInfo::allReduce()
+   FluidInfo info;
+
+   for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
+   {
+      auto pdfField = blockIt->getData< PdfField_T > ( pdfFieldID );
+      auto boundaryHandling = blockIt->getData< BoundaryHandling_T >( boundaryHandlingID );
+      // visit the cells of the PDF field; cells the boundary handling does not report as domain are skipped
+      WALBERLA_FOR_ALL_CELLS_XYZ(pdfField,
+                                 if( !boundaryHandling->isDomain(x,y,z) ) continue;
+                                 ++info.numFluidCells;
+                                 Vector3<real_t> velocity(0_r);
+                                 real_t density = pdfField->getDensityAndVelocity(velocity, x, y, z);
+                                 real_t velMagnitude = velocity.length();
+                                 info.averageVelocity += velMagnitude;
+                                 info.maximumVelocity = std::max(info.maximumVelocity, velMagnitude);
+                                 info.averageDensity += density;
+                                 info.maximumDensity = std::max(info.maximumDensity, density);
+      )
+   }
+   info.allReduce(); // turns the accumulated sums into averages, see FluidInfo::allReduce()
+   return info;
+}
+
+
+
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Basic simulation of a fluidization setup
+ *
+ * Initially, the mono-sized spheres are created on a structured grid inside the domain.
+ * The domain is either periodic or bounded by walls in the horizontal directions (x and y).
+ * In z-direction, a constant inflow from below is provided
+ * and a pressure boundary condition is set at the top, resembling an outflow boundary.
+ *
+ * The simulation is run for the given number of seconds (runtime).
+ *
+ * All parameters should be set via the input file.
+ *
+ * For the overall algorithm and the different model parameters, see
+ * Rettinger, Rüde - An efficient four-way coupled lattice Boltzmann - discrete element method for
+ * fully resolved simulations of particle-laden flows (2020, preprint: https://arxiv.org/abs/2003.01490)
+ *
+ */
+//*******************************************************************************************************************
+int main( int argc, char **argv )
+{
+   Environment env( argc, argv );
+
+   auto cfgFile = env.config();
+   if( !cfgFile )
+   {
+      WALBERLA_ABORT("Usage: " << argv[0] << " path-to-configuration-file \n");
+   }
+
+   WALBERLA_LOG_INFO_ON_ROOT(*cfgFile);
+
+   // read all parameters from the config file
+
+   Config::BlockHandle physicalSetup = cfgFile->getBlock( "PhysicalSetup" );
+   const real_t xSize_SI = physicalSetup.getParameter<real_t>("xSize");
+   const real_t ySize_SI = physicalSetup.getParameter<real_t>("ySize");
+   const real_t zSize_SI = physicalSetup.getParameter<real_t>("zSize");
+   const bool periodicInX = physicalSetup.getParameter<bool>("periodicInX");
+   const bool periodicInY = physicalSetup.getParameter<bool>("periodicInY");
+   const real_t runtime_SI = physicalSetup.getParameter<real_t>("runtime");
+   const real_t uInflow_SI = physicalSetup.getParameter<real_t>("uInflow");
+   const real_t gravitationalAcceleration_SI = physicalSetup.getParameter<real_t>("gravitationalAcceleration");
+   const real_t kinematicViscosityFluid_SI = physicalSetup.getParameter<real_t>("kinematicViscosityFluid");
+   const real_t densityFluid_SI = physicalSetup.getParameter<real_t>("densityFluid");
+   const real_t particleDiameter_SI = physicalSetup.getParameter<real_t>("particleDiameter");
+   const real_t densityParticle_SI = physicalSetup.getParameter<real_t>("densityParticle");
+   const real_t dynamicFrictionCoefficient = physicalSetup.getParameter<real_t>("dynamicFrictionCoefficient");
+   const real_t coefficientOfRestitution = physicalSetup.getParameter<real_t>("coefficientOfRestitution");
+   const real_t particleGenerationSpacing_SI = physicalSetup.getParameter<real_t>("particleGenerationSpacing");
+
+   Config::BlockHandle numericalSetup = cfgFile->getBlock( "NumericalSetup" );
+   const real_t dx_SI = numericalSetup.getParameter<real_t>("dx");
+   const real_t uInflow = numericalSetup.getParameter<real_t>("uInflow");
+   const uint_t numXBlocks = numericalSetup.getParameter<uint_t>("numXBlocks");
+   const uint_t numYBlocks = numericalSetup.getParameter<uint_t>("numYBlocks");
+   const uint_t numZBlocks = numericalSetup.getParameter<uint_t>("numZBlocks");
+   const bool useLubricationForces = numericalSetup.getParameter<bool>("useLubricationForces");
+   const uint_t numberOfParticleSubCycles = numericalSetup.getParameter<uint_t>("numberOfParticleSubCycles");
+
+   Config::BlockHandle outputSetup = cfgFile->getBlock( "Output" );
+   const real_t infoSpacing_SI = outputSetup.getParameter<real_t>("infoSpacing");
+   const real_t vtkSpacingParticles_SI = outputSetup.getParameter<real_t>("vtkSpacingParticles");
+   const real_t vtkSpacingFluid_SI = outputSetup.getParameter<real_t>("vtkSpacingFluid");
+   const std::string vtkFolder = outputSetup.getParameter<std::string>("vtkFolder");
+
+   // convert SI units to simulation (LBM) units and check setup
+
+   Vector3<uint_t> domainSize(uint_c(std::ceil(xSize_SI / dx_SI)),
+                              uint_c(std::ceil(ySize_SI / dx_SI)),
+                              uint_c(std::ceil(zSize_SI / dx_SI)));
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[0]) * dx_SI, xSize_SI, "domain size in x is not divisible by given dx");
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[1]) * dx_SI, ySize_SI, "domain size in y is not divisible by given dx");
+   WALBERLA_CHECK_FLOAT_EQUAL(real_t(domainSize[2]) * dx_SI, zSize_SI, "domain size in z is not divisible by given dx");
+
+   Vector3<uint_t> cellsPerBlockPerDirection( domainSize[0] / numXBlocks,
+                                              domainSize[1] / numYBlocks,
+                                              domainSize[2] / numZBlocks );
+
+   WALBERLA_CHECK_EQUAL(domainSize[0], cellsPerBlockPerDirection[0] * numXBlocks, "number of cells in x of " << domainSize[0] << " is not divisible by given number of blocks in x direction");
+   WALBERLA_CHECK_EQUAL(domainSize[1], cellsPerBlockPerDirection[1] * numYBlocks, "number of cells in y of " << domainSize[1] << " is not divisible by given number of blocks in y direction");
+   WALBERLA_CHECK_EQUAL(domainSize[2], cellsPerBlockPerDirection[2] * numZBlocks, "number of cells in z of " << domainSize[2] << " is not divisible by given number of blocks in z direction");
+
+   WALBERLA_CHECK_GREATER(particleDiameter_SI / dx_SI, 5_r, "Your numerical resolution is below 5 cells per diameter and thus too small for such simulations!");
+
+   const real_t densityRatio = densityParticle_SI / densityFluid_SI;
+   const real_t ReynoldsNumberParticle = uInflow_SI * particleDiameter_SI / kinematicViscosityFluid_SI;
+   const real_t GalileiNumber = std::sqrt((densityRatio-1_r)*particleDiameter_SI*gravitationalAcceleration_SI) * particleDiameter_SI / kinematicViscosityFluid_SI;
+
+   // in simulation units: dt = 1, dx = 1, densityFluid = 1
+
+   const real_t dt_SI = uInflow / uInflow_SI * dx_SI;
+   const real_t diameter = particleDiameter_SI / dx_SI;
+   const real_t particleGenerationSpacing = particleGenerationSpacing_SI / dx_SI;
+   const real_t viscosity =  kinematicViscosityFluid_SI * dt_SI / ( dx_SI * dx_SI );
+   const real_t omega = lbm::collision_model::omegaFromViscosity(viscosity);
+   const real_t gravitationalAcceleration = gravitationalAcceleration_SI * dt_SI * dt_SI / dx_SI;
+   const real_t particleVolume = math::pi / 6_r * diameter * diameter * diameter;
+
+   const real_t densityFluid = real_t(1);
+   const real_t densityParticle = densityRatio;
+   const real_t dx = real_t(1);
+
+   const uint_t numTimeSteps = uint_c(std::ceil(runtime_SI / dt_SI));
+   const uint_t infoSpacing = uint_c(std::ceil(infoSpacing_SI / dt_SI));
+   const uint_t vtkSpacingParticles = uint_c(std::ceil(vtkSpacingParticles_SI / dt_SI));
+   const uint_t vtkSpacingFluid = uint_c(std::ceil(vtkSpacingFluid_SI / dt_SI));
+
+   const Vector3<real_t> inflowVec(0_r, 0_r, uInflow);
+
+   const real_t poissonsRatio = real_t(0.22);
+   const real_t kappa = real_t(2) * ( real_t(1) - poissonsRatio ) / ( real_t(2) - poissonsRatio ) ;
+   const real_t particleCollisionTime = 4_r * diameter;
+
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation setup:");
+   WALBERLA_LOG_INFO_ON_ROOT(" - particles: diameter = " << diameter << ", densityRatio = " << densityRatio);
+   WALBERLA_LOG_INFO_ON_ROOT(" - fluid: kin. visc = " << viscosity << ", relaxation rate = " << omega );
+   WALBERLA_LOG_INFO_ON_ROOT(" - grav. acceleration = " << gravitationalAcceleration );
+   WALBERLA_LOG_INFO_ON_ROOT(" - Galileo number = " << GalileiNumber );
+   WALBERLA_LOG_INFO_ON_ROOT(" - particle Reynolds number = " << ReynoldsNumberParticle );
+   WALBERLA_LOG_INFO_ON_ROOT(" - domain size = " << domainSize);
+   WALBERLA_LOG_INFO_ON_ROOT(" - cells per blocks per direction = " << cellsPerBlockPerDirection);
+   WALBERLA_LOG_INFO_ON_ROOT(" - dx = " << dx_SI << " m");
+   WALBERLA_LOG_INFO_ON_ROOT(" - dt = " << dt_SI << " s");
+   WALBERLA_LOG_INFO_ON_ROOT(" - total time steps = " << numTimeSteps);
+   WALBERLA_LOG_INFO_ON_ROOT(" - particle generation spacing = " << particleGenerationSpacing);
+   WALBERLA_LOG_INFO_ON_ROOT(" - info spacing = " << infoSpacing );
+   WALBERLA_LOG_INFO_ON_ROOT(" - vtk spacing particles = " << vtkSpacingParticles << ", fluid slice = " << vtkSpacingFluid);
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const bool periodicInZ = false;
+   shared_ptr< StructuredBlockForest > blocks  = blockforest::createUniformBlockGrid( numXBlocks, numYBlocks, numZBlocks,
+                                                                                      cellsPerBlockPerDirection[0], cellsPerBlockPerDirection[1], cellsPerBlockPerDirection[2], dx,
+                                                                                      0, false, false,
+                                                                                      periodicInX, periodicInY, periodicInZ, //periodicity
+                                                                                      false );
+
+   auto simulationDomain = blocks->getDomain();
+
+   //////////////////
+   // RPD COUPLING //
+   //////////////////
+
+   auto rpdDomain = std::make_shared<mesa_pd::domain::BlockForestDomain>(blocks->getBlockForestPointer());
+
+   //init data structures
+   auto ps = walberla::make_shared<mesa_pd::data::ParticleStorage>(1);
+   auto ss = walberla::make_shared<mesa_pd::data::ShapeStorage>();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor = walberla::make_shared<ParticleAccessor_T >(ps, ss);
+
+   // prevent particles from interfering with inflow and outflow by putting the bounding planes slightly in front
+   const real_t planeOffsetFromInflow = dx;
+   const real_t planeOffsetFromOutflow = dx;
+   createPlaneSetup(ps, ss, simulationDomain, periodicInX, periodicInY, planeOffsetFromInflow, planeOffsetFromOutflow);
+
+   auto sphereShape = ss->create<mesa_pd::data::Sphere>( diameter * real_t(0.5) );
+   ss->shapes[sphereShape]->updateMassAndInertia(densityParticle);
+
+   // create spheres
+   auto generationDomain = simulationDomain.getExtended(-particleGenerationSpacing*0.5_r);
+   for (auto pt : grid_generator::SCGrid( generationDomain, generationDomain.center(), particleGenerationSpacing))
+   {
+      if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), pt)) {
+         mesa_pd::data::Particle &&p = *ps->create();
+         p.setPosition(pt);
+         p.setInteractionRadius(diameter * real_t(0.5));
+         p.setOwner(mpi::MPIManager::instance()->rank());
+         p.setShapeID(sphereShape);
+         p.setType(0);
+         p.setLinearVelocity(0.1_r * Vector3<real_t>(math::realRandom(-uInflow, uInflow))); // set small initial velocity to break symmetries
+      }
+   }
+
+
+   ////////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   LatticeModel_T latticeModel = LatticeModel_T(omega);
+
+   // add PDF field
+   BlockDataID pdfFieldID = lbm::addPdfFieldToStorage< LatticeModel_T >( blocks, "pdf field", latticeModel,
+                                                                         inflowVec, densityFluid,
+                                                                uint_t(1), field::zyxf );
+
+
+   // add flag field
+   BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
+
+   // add particle field
+   BlockDataID particleFieldID = field::addToStorage<lbm_mesapd_coupling::ParticleField_T>( blocks, "particle field", accessor->getInvalidUid(), field::zyxf, FieldGhostLayers );
+
+   // add boundary handling
+   using BoundaryHandling_T = MyBoundaryHandling<ParticleAccessor_T>::Type;
+   BlockDataID boundaryHandlingID = blocks->addStructuredBlockData< BoundaryHandling_T >(MyBoundaryHandling<ParticleAccessor_T>( flagFieldID, pdfFieldID, particleFieldID, accessor, inflowVec, periodicInX, periodicInY), "boundary handling" );
+
+   // set up RPD functionality
+   std::function<void(void)> syncCall = [&ps,&rpdDomain](){
+      const real_t overlap = real_t( 1.5 );
+      mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+      syncNextNeighborFunc(*ps, *rpdDomain, overlap);
+   };
+
+   syncCall();
+
+   real_t timeStepSizeRPD = real_t(1)/real_t(numberOfParticleSubCycles);
+   mesa_pd::kernel::VelocityVerletPreForceUpdate  vvIntegratorPreForce(timeStepSizeRPD);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeRPD);
+   mesa_pd::kernel::LinearSpringDashpot collisionResponse(1);
+   collisionResponse.setFrictionCoefficientDynamic(0,0,dynamicFrictionCoefficient);
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+
+   // set up coupling functionality
+   Vector3<real_t> gravitationalForce( real_t(0), real_t(0), -(densityParticle - densityFluid) * gravitationalAcceleration * particleVolume );
+   lbm_mesapd_coupling::AddForceOnParticlesKernel addGravitationalForce(gravitationalForce);
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(viscosity, [](real_t r){return (real_t(0.001 + real_t(0.00007)*r))*r;});
+   lbm_mesapd_coupling::ParticleMappingKernel<BoundaryHandling_T> particleMappingKernel(blocks, boundaryHandlingID);
+   lbm_mesapd_coupling::MovingParticleMappingKernel<BoundaryHandling_T> movingParticleMappingKernel(blocks, boundaryHandlingID, particleFieldID);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // map particles into the LBM simulation
+   // note: planes are not mapped and are thus only visible to the particles, not to the fluid
+   // instead, the respective boundary conditions for the fluid are explicitly set, see the boundary handling
+   ps->forEachParticle(false, lbm_mesapd_coupling::RegularParticlesSelector(), *accessor, movingParticleMappingKernel, *accessor, MO_Flag);
+
+   // setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+   blockforest::communication::UniformBufferedScheme< Stencil_T > optimizedPDFCommunicationScheme( blocks );
+   optimizedPDFCommunicationScheme.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldID ) ); // optimized sync
+
+   blockforest::communication::UniformBufferedScheme< Stencil_T > fullPDFCommunicationScheme( blocks );
+   fullPDFCommunicationScheme.addPackInfo( make_shared< field::communication::PackInfo< PdfField_T > >( pdfFieldID ) ); // full sync
+
+   // create the timeloop
+
+   SweepTimeloop timeloop( blocks->getBlockStorage(), numTimeSteps );
+
+   timeloop.addFuncBeforeTimeStep( RemainingTimeLogger( timeloop.getNrOfTimeSteps() ), "Remaining Time Logger" );
+
+   // vtk output
+   if( vtkSpacingParticles != uint_t(0) ) {
+      // sphere
+      auto particleVtkOutput = make_shared<mesa_pd::vtk::ParticleVtkOutput>(ps);
+      particleVtkOutput->addOutput<mesa_pd::data::SelectParticleUid>("uid");
+      particleVtkOutput->addOutput<mesa_pd::data::SelectParticleLinearVelocity>("velocity");
+      particleVtkOutput->addOutput<mesa_pd::data::SelectParticleInteractionRadius>("radius");
+      //limit output to process-local spheres
+      particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator &pIt) { return pIt->getShapeID() == sphereShape &&
+                                                                                                                         !(mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST)); });
+      auto particleVtkWriter = vtk::createVTKOutput_PointData( particleVtkOutput, "particles", vtkSpacingParticles, vtkFolder );
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+   }
+
+   if( vtkSpacingFluid != uint_t(0) )
+   {
+      // velocity field, only a slice
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData( blocks, "fluid", vtkSpacingFluid, 0, false, vtkFolder );
+
+      pdfFieldVTK->addBeforeFunction( fullPDFCommunicationScheme );
+
+      AABB sliceAABB( real_t(0), real_c(domainSize[1])*real_t(0.5)-real_t(1), real_t(0),
+                      real_c(domainSize[0]), real_c(domainSize[1])*real_t(0.5)+real_t(1), real_c(domainSize[2]) );
+      vtk::AABBCellFilter aabbSliceFilter( sliceAABB );
+
+      field::FlagFieldCellFilter< FlagField_T > fluidFilter( flagFieldID );
+      fluidFilter.addFlag( Fluid_Flag );
+
+      vtk::ChainedFilter combinedSliceFilter;
+      combinedSliceFilter.addFilter( fluidFilter );
+      combinedSliceFilter.addFilter( aabbSliceFilter );
+
+      pdfFieldVTK->addCellInclusionFilter( combinedSliceFilter );
+
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::VelocityVTKWriter< LatticeModel_T, float > >( pdfFieldID, "Velocity" ) );
+      pdfFieldVTK->addCellDataWriter( make_shared< lbm::DensityVTKWriter < LatticeModel_T, float > >( pdfFieldID, "Density" ) );
+
+      timeloop.addFuncBeforeTimeStep( vtk::writeFiles( pdfFieldVTK ), "VTK (fluid field data)" );
+
+   }
+
+   if( vtkSpacingFluid != uint_t(0) || vtkSpacingParticles != uint_t(0) )
+   {
+      vtk::writeDomainDecomposition( blocks, "domain_decomposition", vtkFolder );
+   }
+
+   // add LBM communication function and boundary handling sweep (does the hydro force calculations and the no-slip treatment)
+   timeloop.add() << BeforeFunction( optimizedPDFCommunicationScheme, "LBM Communication" )
+                  << Sweep( BoundaryHandling_T::getBlockSweep( boundaryHandlingID ), "Boundary Handling" );
+
+   // stream + collide LBM step
+   auto lbmSweep = lbm::makeCellwiseSweep< LatticeModel_T, FlagField_T >( pdfFieldID, flagFieldID, Fluid_Flag );
+   timeloop.add() << Sweep( makeSharedSweep( lbmSweep ), "LBM stream / collide" );
+
+
+   // this is carried out after the particle integration, it corrects the flag field and restores missing PDF information
+   // then, the checkpointing file can be written, as otherwise some cells are invalid and can not be recovered
+   SweepTimeloop timeloopAfterParticles( blocks->getBlockStorage(), numTimeSteps );
+
+   // sweep for updating the particle mapping into the LBM simulation
+   bool strictlyConserveMomentum = false;
+   timeloopAfterParticles.add() << Sweep( lbm_mesapd_coupling::makeMovingParticleMapping<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID,boundaryHandlingID, particleFieldID, accessor, MO_Flag, FormerMO_Flag, lbm_mesapd_coupling::RegularParticlesSelector(), strictlyConserveMomentum), "Particle Mapping" );
+
+   // sweep for restoring PDFs in cells previously occupied by particles
+   bool reconstruction_recomputeTargetDensity = false;
+   bool reconstruction_useCentralDifferences = true;
+   auto gradReconstructor = lbm_mesapd_coupling::makeGradsMomentApproximationReconstructor<BoundaryHandling_T>(blocks, boundaryHandlingID, omega, reconstruction_recomputeTargetDensity,reconstruction_useCentralDifferences);
+   timeloopAfterParticles.add() << BeforeFunction( fullPDFCommunicationScheme, "PDF Communication" )
+                                << Sweep( makeSharedSweep(lbm_mesapd_coupling::makePdfReconstructionManager<PdfField_T,BoundaryHandling_T>(blocks, pdfFieldID, boundaryHandlingID, particleFieldID, accessor, FormerMO_Flag, Fluid_Flag, gradReconstructor, strictlyConserveMomentum) ), "PDF Restore" );
+
+   ////////////////////////
+   // EXECUTE SIMULATION //
+   ////////////////////////
+
+   WcTimingPool timeloopTiming;
+   const bool useOpenMP = false;
+
+   // time loop
+   for (uint_t timeStep = 0; timeStep < numTimeSteps; ++timeStep )
+   {
+      // perform a single simulation step -> this contains LBM and setting of the hydrodynamic interactions
+      timeloop.singleStep( timeloopTiming );
+
+      reduceProperty.operator()<mesa_pd::HydrodynamicForceTorqueNotification>(*ps);
+
+      if( timeStep == 0 )
+      {
+         lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel initializeHydrodynamicForceTorqueForAveragingKernel;
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, initializeHydrodynamicForceTorqueForAveragingKernel, *accessor );
+      }
+      ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, averageHydrodynamicForceTorque, *accessor );
+
+      for(auto subCycle = uint_t(0); subCycle < numberOfParticleSubCycles; ++subCycle )
+      {
+
+         timeloopTiming["RPD"].start();
+
+         ps->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPreForce, *accessor);
+         syncCall();
+
+         if(useLubricationForces)
+         {
+            // lubrication correction
+            ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                        [&lubricationCorrectionKernel,&rpdDomain](const size_t idx1, const size_t idx2, auto& ac)
+                                        {
+                                           mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                           acd.getContactThreshold() = lubricationCorrectionKernel.getNormalCutOffDistance();
+                                           mesa_pd::kernel::DoubleCast double_cast;
+                                           mesa_pd::mpi::ContactFilter contact_filter;
+                                           if (double_cast(idx1, idx2, ac, acd, ac ))
+                                           {
+                                              if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                              {
+                                                 double_cast(acd.getIdx1(), acd.getIdx2(), ac, lubricationCorrectionKernel, ac, acd.getContactNormal(), acd.getPenetrationDepth());
+                                              }
+                                           }
+                                        },
+                                        *accessor );
+         }
+
+
+         // collision response
+         ps->forEachParticlePairHalf(useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *accessor,
+                                     [&collisionResponse, &rpdDomain, timeStepSizeRPD, coefficientOfRestitution, particleCollisionTime, kappa]
+                                     (const size_t idx1, const size_t idx2, auto& ac)
+                                     {
+                                        mesa_pd::collision_detection::AnalyticContactDetection acd;
+                                        mesa_pd::kernel::DoubleCast double_cast;
+                                        mesa_pd::mpi::ContactFilter contact_filter;
+                                        if (double_cast(idx1, idx2, ac, acd, ac ))
+                                        {
+                                           if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *rpdDomain))
+                                           {
+                                              auto meff = real_t(1) / (ac.getInvMass(idx1) + ac.getInvMass(idx2));
+                                              collisionResponse.setStiffnessAndDamping(0,0,coefficientOfRestitution, particleCollisionTime, kappa, meff);
+                                              collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(), acd.getPenetrationDepth(), timeStepSizeRPD);
+                                           }
+                                        }
+                                     },
+                                     *accessor );
+
+         reduceAndSwapContactHistory(*ps);
+
+         // add hydrodynamic force
+         lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addHydrodynamicInteraction, *accessor );
+
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, addGravitationalForce, *accessor );
+
+         reduceProperty.operator()<mesa_pd::ForceTorqueNotification>(*ps);
+
+         ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor, vvIntegratorPostForce, *accessor);
+         syncCall();
+
+         timeloopTiming["RPD"].end();
+      }
+
+      ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectAll(), *accessor, resetHydrodynamicForceTorque, *accessor );
+
+      // update particle mapping
+      timeloopAfterParticles.singleStep(timeloopTiming);
+
+
+      if(timeStep % infoSpacing == 0)
+      {
+         timeloopTiming["Evaluate infos"].start();
+
+         auto particleInfo = evaluateParticleInfo(*accessor);
+         WALBERLA_LOG_INFO_ON_ROOT(particleInfo);
+
+         auto fluidInfo = evaluateFluidInfo<BoundaryHandling_T>(blocks, pdfFieldID, boundaryHandlingID);
+         WALBERLA_LOG_INFO_ON_ROOT(fluidInfo);
+
+         timeloopTiming["Evaluate infos"].end();
+      }
+
+
+   }
+
+   timeloopTiming.logResultOnRoot();
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace fluidized_bed
+
+int main( int argc, char **argv ){ // process entry point; all work is done by fluidized_bed::main
+   return fluidized_bed::main(argc, argv); // forward the driver's status; without 'return' the process always exits with 0
+}
diff --git a/apps/showcases/FluidizedBed/input.prm b/apps/showcases/FluidizedBed/input.prm
new file mode 100644
index 0000000000000000000000000000000000000000..6b3824f07cb8b6f7e94af1dc3b9c22f8c176eba2
--- /dev/null
+++ b/apps/showcases/FluidizedBed/input.prm
@@ -0,0 +1,47 @@
+PhysicalSetup // all to be specified in SI units!
+{
+    xSize 0.05; // = width
+    ySize 0.02; // = depth
+    zSize 0.10; // = height
+
+    periodicInX true;
+    periodicInY true;
+
+    runtime 10.0;
+
+    uInflow 0.01;
+    gravitationalAcceleration 9.81;
+
+    kinematicViscosityFluid 1e-5;
+    densityFluid 1000.;
+
+    particleDiameter 0.002;
+    densityParticle 1100.;
+    dynamicFrictionCoefficient 0.15;
+    coefficientOfRestitution 0.6;
+
+    particleGenerationSpacing 0.00401;
+}
+
+NumericalSetup
+{
+    dx 0.0002; // in m
+    uInflow 0.01; // in LBM units, should be smaller than 0.1, this then determines dt
+
+    // product of number of blocks should be equal to number of used processes
+    numXBlocks 2;
+    numYBlocks 2;
+    numZBlocks 2;
+
+    useLubricationForces true;
+    numberOfParticleSubCycles 10;
+}
+
+Output
+{
+    infoSpacing 0.01; // in s
+
+    vtkSpacingParticles 0.1; // in s
+    vtkSpacingFluid 0.1; // in s
+    vtkFolder vtk_out;
+}
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/CMakeLists.txt b/apps/showcases/PhaseFieldAllenCahn/CPU/CMakeLists.txt
index adacc326397db495352f958afa90fb26ae63e5d4..cf550a8e1f8392dbce80bb709ac71c3857724787 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/CMakeLists.txt
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/CMakeLists.txt
@@ -10,11 +10,14 @@ waLBerla_generate_target_from_python(NAME PhaseFieldCodeGenCPU
         phase_field_LB_NoSlip.cpp phase_field_LB_NoSlip.h
         hydro_LB_step.cpp hydro_LB_step.h
         hydro_LB_NoSlip.cpp hydro_LB_NoSlip.h
-        stream_hydro.cpp stream_hydro.h
+        PackInfo_phase_field_distributions.cpp PackInfo_phase_field_distributions.h
+        PackInfo_velocity_based_distributions.cpp PackInfo_velocity_based_distributions.h
+        PackInfo_phase_field.cpp PackInfo_phase_field.h
+        ContactAngle.cpp ContactAngle.h
         GenDefines.h)
 
 waLBerla_add_executable(NAME multiphaseCPU
-        FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp contact.cpp CalculateNormals.cpp multiphase_codegen.py
+        FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp multiphase_codegen.py
         DEPENDS blockforest core field postprocessing lbm geometry timeloop gui PhaseFieldCodeGenCPU)
 
 set_target_properties(multiphaseCPU PROPERTIES CXX_VISIBILITY_PRESET hidden)
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.cpp b/apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.cpp
deleted file mode 100644
index e20053e38130262fdf2bb8c556f1fd665888a945..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file CalculateNormals.cpp
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-#include "CalculateNormals.h"
-
-#include "core/Environment.h"
-#include "core/logging/Initialization.h"
-
-#include "field/FlagField.h"
-
-namespace walberla
-{
-using FlagField_T    = FlagField< uint8_t >;
-using NormalsField_T = GhostLayerField< int8_t, 3 >;
-
-void calculate_normals(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID normalsFieldID,
-                       ConstBlockDataID flagFieldID, FlagUID domainFlagUID, FlagUID boundaryFlagUID)
-{
-   for (auto& block : *blocks)
-   {
-      CellInterval globalCellBB = blocks->getBlockCellBB(block);
-      CellInterval blockLocalCellBB;
-      blocks->transformGlobalToBlockLocalCellInterval(blockLocalCellBB, block, globalCellBB);
-
-      auto* normalsField = block.getData< NormalsField_T >(normalsFieldID);
-      auto* flagField    = block.getData< FlagField_T >(flagFieldID);
-      auto boundaryFlag  = flagField->getFlag(boundaryFlagUID);
-      auto domainFlag    = flagField->getFlag(domainFlagUID);
-
-      // clang-format off
-      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(normalsField,
-
-         if( x < blockLocalCellBB.xMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x + 1, y, z) == domainFlag)
-               normalsField->get(x, y, z, 0) = 1;
-         }
-
-         if( x > blockLocalCellBB.xMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x - 1, y, z) == domainFlag)
-               normalsField->get(x, y, z, 0) = - 1;
-         }
-
-         if( y < blockLocalCellBB.yMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y + 1, z) == domainFlag)
-               normalsField->get(x, y, z, 1) = 1;
-         }
-
-         if( y > blockLocalCellBB.yMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y - 1, z) == domainFlag)
-               normalsField->get(x, y, z, 1) = - 1;
-         }
-
-         if( z < blockLocalCellBB.zMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y, z + 1) == domainFlag)
-               normalsField->get(x, y, z, 2) = 1;
-         }
-
-         if( z > blockLocalCellBB.zMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y, z - 1) == domainFlag)
-               normalsField->get(x, y, z, 2) = - 1;
-         }
-
-      )
-      // clang-format on
-   }
-}
-} // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.cpp b/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.cpp
index 0fd64cb73cefcfefb25a565fee9c80b4c127b155..979c87ff330f669567df87c2e051a75271280b6b 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.cpp
@@ -53,24 +53,196 @@ void initPhaseField_sphere(const shared_ptr< StructuredBlockStorage >& blocks, B
    }
 }
 
+void init_Taylor_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
+                        const real_t D = 5, const real_t H = 2, const real_t DT = 20, const real_t Donut_x0 = 40)
+{ // writes 0.0 into the phase field inside the region shaped by D, H, DT and Donut_x0, and 1.0 everywhere else
+   const auto centerZ = blocks->getDomainCellBB().zMax() / 2.0;
+   const auto centerX = blocks->getDomainCellBB().xMax() / 2.0;
+   // every cell (ghost layers included) is classified by its global coordinates
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+         phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+         const auto dx = globalCell[0] - centerX; const auto dz = globalCell[2] - centerZ;
+         // NaN when |DT - r| > H (r = distance from the x-z centre): both comparisons below then fail and the cell gets 1.0
+         const real_t extent = D * sqrt(pow(H, 2) - pow(DT - sqrt(pow(dx, 2) + pow(dz, 2)), 2));
+         real_t angle = atan2(dz, dx);
+         if (angle < 0) angle = angle + 2 * math::pi; // map atan2's (-pi, pi] range onto [0, 2*pi)
+         const bool inside = (globalCell[1] < Donut_x0 + extent * sin(angle / 2.0)) && (globalCell[1] > Donut_x0 - extent);
+         if (inside) { phaseField->get(x, y, z) = 0.0; }
+         else { phaseField->get(x, y, z) = 1.0; })
+   }
+}
+
+void init_bubble_field(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
+                       real_t W = 5)
+{
+   // Fills the phase field with an nx x nz grid of randomly perturbed bubbles (nominal radius R, interface width W)
+   // plus a gas layer near the top of the domain in y. R and W are compared against global cell indices.
+   auto X = blocks->getDomainCellBB().xMax();
+   auto Y = blocks->getDomainCellBB().yMax();
+   auto Z = blocks->getDomainCellBB().zMax();
+
+   // 20 percent from the top are filled with the gas phase
+   real_t gas_top = Y - Y / 5.0;
+
+   // Diameter of the bubble
+   real_t D = R * 2;
+
+   // distance in between the bubbles
+   int dist = 4;
+   auto nx  = static_cast< unsigned int >(floor(X / (D + dist * W)));
+   auto nz  = static_cast< unsigned int >(floor(Z / (D + dist * W)));
+
+   // fluctuation of the bubble radii
+   std::vector< std::vector< real_t > > fluctuation_radius(nx, std::vector< real_t >(nz, 0.0));
+   std::vector< std::vector< real_t > > fluctuation_pos(nx, std::vector< real_t >(nz, 0.0));
+
+   real_t max_fluctuation_radius = R / 5;
+   real_t max_fluctuation_pos    = (dist * W) / 3.0;
+   for (unsigned int i = 0; i < nx; ++i)
+   {
+      for (unsigned int j = 0; j < nz; ++j)
+      {
+         fluctuation_radius[i][j] = math::realRandom< real_t >(-max_fluctuation_radius, max_fluctuation_radius);
+         fluctuation_pos[i][j]    = math::realRandom< real_t >(-max_fluctuation_pos, max_fluctuation_pos);
+      }
+   }
+
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+         phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+         Vector3< real_t > bubbleMidPoint; // per-cell local: a variable shared across the (possibly OpenMP-parallel) cell loop would race
+         for (unsigned int i = 0; i < nx; ++i) {
+            for (unsigned int j = 0; j < nz; ++j) {
+               bubbleMidPoint[0] = (i + 1) * (D + (dist * W)) - (D + (dist * W)) / 2.0 + fluctuation_pos[i][j];
+               bubbleMidPoint[1] = R + W + 4;
+               bubbleMidPoint[2] = (j + 1) * (D + (dist * W)) - (D + (dist * W)) / 2.0 + fluctuation_pos[i][j];
+
+               real_t Ri = sqrt((globalCell[0] - bubbleMidPoint[0]) * (globalCell[0] - bubbleMidPoint[0]) +
+                                (globalCell[1] - bubbleMidPoint[1]) * (globalCell[1] - bubbleMidPoint[1]) +
+                                (globalCell[2] - bubbleMidPoint[2]) * (globalCell[2] - bubbleMidPoint[2]));
+               if (globalCell[0] >= i * (D + dist * W) && globalCell[0] <= (i + 1) * (D + dist * W) &&
+                   globalCell[2] >= j * (D + dist * W) && globalCell[2] <= (j + 1) * (D + dist * W))
+                  phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(2.0 * (Ri - (R - fluctuation_radius[i][j])) / W);
+
+               if (globalCell[0] > nx * (D + dist * W)) phaseField->get(x, y, z) = 1.0;
+               if (globalCell[2] > nz * (D + dist * W)) phaseField->get(x, y, z) = 1.0;
+            }
+         }
+
+         if (globalCell[1] > gas_top) {
+            phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(2.0 * (gas_top + 10 - globalCell[1]) / W);
+         })
+   }
+}
+
 void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
-                        const real_t W = 5)
+                        const real_t W = 5, const bool pipe = true) // pipe: true -> radial perturbation, false -> sum of cosines in x and z
 {
    auto X              = blocks->getDomainCellBB().xMax();
+   auto Z              = blocks->getDomainCellBB().zMax(); // only read by the pipe branch
    auto halfY          = (blocks->getDomainCellBB().yMax()) / 2.0;
-   double perturbation = 0.05;
+   real_t perturbation = 0.05; // perturbation amplitude; multiplied by X where it is used
 
-   for (auto& block : *blocks)
+   if (pipe) // perturbation depends only on the distance R from the x-z centre
    {
-      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
-      // clang-format off
-      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(phaseField, Cell globalCell;
-         blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-         real_t tmp =
-         perturbation * X * (cos((2.0 * math::pi * globalCell[0]) / X) + cos((2.0 * math::pi * globalCell[2]) / X));
-         phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) - tmp) / (W / 2.0));
-      )
-      // clang-format on
+      for (auto& block : *blocks)
+      {
+         auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R     = sqrt((globalCell[0] - X / 2) * (globalCell[0] - X / 2) + // NOTE(review): X / 2 and Z / 2 truncate if xMax()/zMax() are integers - confirm
+                            (globalCell[2] - Z / 2) * (globalCell[2] - Z / 2));
+            if (R > X) R = X; real_t tmp = perturbation * X * cos((2.0 * math::pi * R) / X); // only the clamp R = X is conditional; tmp is always computed
+            phaseField->get(x, y, z)     = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) + tmp) / (W / 2.0));) // value 0.5 where y = halfY - tmp
+      }
+   }
+   else // perturbation is the sum of one cosine in x and one in z, both with period X
+   {
+      for (auto& block : *blocks)
+      {
+         auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t tmp = perturbation * X *
+                         (cos((2.0 * math::pi * globalCell[0]) / X) + cos((2.0 * math::pi * globalCell[2]) / X));
+            phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) - tmp) / (W / 2.0));) // value 0.5 where y = halfY + tmp
+      }
+   }
+}
+
+void initTubeWithCylinder(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID flagFieldID,
+                          field::FlagUID boundaryFlagUID, real_t const R_in, real_t const eccentricity,
+                          real_t const start_transition, real_t const length_transition,
+                          bool const eccentricity_or_pipe_ratio)
+{ // Adds boundaryFlagUID to every cell inside an inner cylinder or outside the outer tube wall; the tube axis runs along y.
+   if (eccentricity_or_pipe_ratio) // true: inner cylinder keeps radius R_in while its axis is shifted in x over the transition
+   {
+      auto Mx = blocks->getDomainCellBB().xMax() / 2.0;
+      auto Mz = blocks->getDomainCellBB().zMax() / 2.0;
+      // outer wall radius: half of xMax() plus one cell
+      auto R_outer = blocks->getDomainCellBB().xMax() / 2.0 + 1.0;
+      // full x offset of the inner cylinder axis, applied for y >= start_transition + length_transition
+      real_t const shift = eccentricity * Mx / 2;
+
+      for (auto& block : *blocks)
+      {
+         auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+         auto boundaryFlag = flagField->getOrRegisterFlag(boundaryFlagUID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(flagField, Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R1;
+            if (globalCell[1] <= start_transition) {
+               R1 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            } else if (globalCell[1] > start_transition && globalCell[1] < start_transition + length_transition) {
+               real_t tmp       = math::pi * (globalCell[1] - start_transition) / (length_transition);
+               real_t shift_tmp = shift * 0.5 * (1 - cos(tmp));
+               R1               = sqrt((globalCell[0] - Mx - shift_tmp) * (globalCell[0] - Mx - shift_tmp) +
+                         (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            } else {
+               R1 = sqrt((globalCell[0] - Mx - shift) * (globalCell[0] - Mx - shift) +
+                         (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            }
+
+            real_t R2 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            if (R1 < R_in) addFlag(flagField->get(x, y, z), boundaryFlag);
+            if (R2 > R_outer) addFlag(flagField->get(x, y, z), boundaryFlag);)
+      }
+   }
+   else // false: inner cylinder stays centred while its radius grows from R_in to R_in * (1 + eccentricity)
+   {
+      auto Mx = blocks->getDomainCellBB().xMax() / 2.0;
+      auto Mz = blocks->getDomainCellBB().zMax() / 2.0;
+
+      auto R_outer = blocks->getDomainCellBB().xMax() / 2.0 + 1.0;
+
+      real_t const shift = eccentricity * R_in;
+
+      for (auto& block : *blocks)
+      {
+         auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+         auto boundaryFlag = flagField->getOrRegisterFlag(boundaryFlagUID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            flagField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R_tmp; // per-cell local: a variable shared across the (possibly OpenMP-parallel) cell loop would race
+            if (globalCell[1] <= start_transition) {
+               R_tmp = R_in;
+            } else if (globalCell[1] > start_transition && globalCell[1] < start_transition + length_transition) {
+               real_t tmp       = math::pi * (globalCell[1] - start_transition) / (length_transition);
+               real_t shift_tmp = shift * 0.5 * (1 - cos(tmp));
+               R_tmp = R_in + shift_tmp;
+            } else {
+               R_tmp = R_in + shift;
+            }
+
+            real_t R2 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            if (R2 < R_tmp) addFlag(flagField->get(x, y, z), boundaryFlag);
+            if (R2 > R_outer) addFlag(flagField->get(x, y, z), boundaryFlag);)
+      }
    }
 }
 } // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.h b/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.h
index 04bac53f2e75d17ddd393da0e247b63df05d0704..585639ac9078beb048a8ce694ea97552e59b833a 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.h
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/InitializerFunctions.h
@@ -34,6 +34,17 @@ namespace walberla
 void initPhaseField_sphere(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
                            Vector3< real_t > bubbleMidPoint, bool bubble = true, real_t W = 5);
 
-void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t W = 5);
+void init_Taylor_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t D = 5,
+                        real_t H = 2, real_t DT = 20, real_t Donut_x0 = 40);
+
+void init_bubble_field(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
+                       real_t W = 5);
+
+void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t W = 5,
+                        const bool pipe = true);
+
+void initTubeWithCylinder(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID flagFieldID,
+                          field::FlagUID boundaryFlagUID, real_t R_in, real_t eccentricity, real_t start_transition,
+                          real_t length_transition, bool const eccentricity_or_pipe_ratio);
 
 } // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/contact.cpp b/apps/showcases/PhaseFieldAllenCahn/CPU/contact.cpp
deleted file mode 100644
index 8253e1d4cd26c071e4c1abd775d8aee82e27ce26..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/contact.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file contact.cpp
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-
-#include "contact.h"
-
-#include "core/DataTypes.h"
-
-#include <cmath>
-
-#define FUNC_PREFIX
-
-namespace walberla
-{
-namespace lbm
-{
-#ifdef __GNUC__
-#   pragma GCC diagnostic push
-#endif
-
-namespace internal_boundary_contact
-{
-static FUNC_PREFIX void contact_angle_treatment(uint8_t* WALBERLA_RESTRICT const _data_indexVector, double* WALBERLA_RESTRICT _data_phase,
-                                                int64_t const _stride_phase_0, int64_t const _stride_phase_1,
-                                                int64_t const _stride_phase_2, int64_t indexVectorSize, double alpha)
-{
-   for (int ctr_0 = 0; ctr_0 < indexVectorSize; ctr_0 += 1)
-   {
-      const int32_t x = *((int32_t*) (&_data_indexVector[24 * ctr_0]));
-      const int32_t y = *((int32_t*) (&_data_indexVector[24 * ctr_0 + 4]));
-      const int32_t z = *((int32_t*) (&_data_indexVector[24 * ctr_0 + 8]));
-
-      const int32_t nx = *((int32_t*) (&_data_indexVector[24 * ctr_0 + 12]));
-      const int32_t ny = *((int32_t*) (&_data_indexVector[24 * ctr_0 + 16]));
-      const int32_t nz = *((int32_t*) (&_data_indexVector[24 * ctr_0 + 20]));
-
-      const int32_t x1 = x + nx;
-      const int32_t y1 = y + ny;
-      const int32_t z1 = z + nz;
-
-      const double h = 0.5 * sqrt((float) (nx * nx + ny * ny + nz * nz));
-      const double a = cos(alpha);
-      const double W = 5;
-
-      double* WALBERLA_RESTRICT _phase_wall     = _data_phase + _stride_phase_1 * y + _stride_phase_2 * z;
-      double* WALBERLA_RESTRICT _phase_interior = _data_phase + _stride_phase_1 * y1 + _stride_phase_2 * z1;
-      if (h < 0.001) { _phase_wall[_stride_phase_0 * x] = 1.0; }
-      else if (a > 1e-8 || a < -1e-8)
-      {
-         const double var = -h * (4.0 / W) * a;
-         _phase_wall[_stride_phase_0 * x] =
-            (1 + var - sqrt((1 + var) * (1 + var) - 4 * var * _phase_interior[_stride_phase_0 * x1])) / (var + 1e-12) -
-            _phase_interior[_stride_phase_0 * x1];
-      }
-      else
-      {
-         _phase_wall[_stride_phase_0 * x] = _phase_interior[_stride_phase_0 * x1];
-      }
-   }
-}
-} // namespace internal_boundary_contact
-
-#ifdef __GNUC__
-#   pragma GCC diagnostic pop
-#endif
-
-#ifdef __CUDACC__
-#   pragma pop
-#endif
-
-void contact::run(IBlock* block, IndexVectors::Type type)
-{
-   auto* indexVectors      = block->getData< IndexVectors >(indexVectorID);
-   int64_t indexVectorSize = int64_c(indexVectors->indexVector(type).size());
-   if (indexVectorSize == 0) return;
-
-   auto pointer = indexVectors->pointerCpu(type);
-
-   auto* _data_indexVector = reinterpret_cast< uint8_t* >(pointer);
-
-   auto phaseField = block->getData< field::GhostLayerField< double, 1 > >(phaseFieldID);
-   auto& alpha     = this->alpha_;
-
-   WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(phaseField->nrOfGhostLayers()))
-   double* WALBERLA_RESTRICT _data_phase = phaseField->dataAt(0, 0, 0, 0);
-   const auto _stride_pdfs_0    = int64_t(phaseField->xStride());
-   const auto _stride_pdfs_1    = int64_t(phaseField->yStride());
-   const auto _stride_pdfs_2    = int64_t(phaseField->zStride());
-   internal_boundary_contact::contact_angle_treatment(_data_indexVector, _data_phase, _stride_pdfs_0, _stride_pdfs_1,
-                                                      _stride_pdfs_2, indexVectorSize, alpha);
-}
-
-void contact::operator()(IBlock* block) { run(block, IndexVectors::ALL); }
-
-void contact::inner(IBlock* block) { run(block, IndexVectors::INNER); }
-
-void contact::outer(IBlock* block) { run(block, IndexVectors::OUTER); }
-
-} // namespace lbm
-} // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/contact.h b/apps/showcases/PhaseFieldAllenCahn/CPU/contact.h
deleted file mode 100644
index dbaf4a560b3a717c96b34ae811a58f048776eff8..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/contact.h
+++ /dev/null
@@ -1,132 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file contact.h
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-
-#include "blockforest/StructuredBlockForest.h"
-
-#include "core/DataTypes.h"
-
-#include "domain_decomposition/BlockDataID.h"
-#include "domain_decomposition/IBlock.h"
-
-#include "field/FlagField.h"
-#include "field/GhostLayerField.h"
-
-#include <set>
-#include <vector>
-
-namespace walberla
-{
-namespace lbm
-{
-class contact
-{
- public:
-   struct IndexInfo
-   {
-      int32_t x1;
-      int32_t y1;
-      int32_t z1;
-      int32_t x2;
-      int32_t y2;
-      int32_t z2;
-      IndexInfo(int32_t x1_, int32_t y1_, int32_t z1_, int32_t x2_, int32_t y2_, int32_t z2_)
-         : x1(x1_), y1(y1_), z1(z1_), x2(x2_), y2(y2_), z2(z2_)
-      {}
-      bool operator==(const IndexInfo& o) const
-      {
-         return x1 == o.x1 && y1 == o.y1 && z1 == o.z1 && x2 == o.x2 && y2 == o.y2 && z2 == o.z2;
-      }
-   };
-
-   class IndexVectors
-   {
-    public:
-      using CpuIndexVector = std::vector< IndexInfo >;
-
-      enum Type { ALL = 0, INNER = 1, OUTER = 2, NUM_TYPES = 3 };
-
-      IndexVectors() : cpuVectors_(NUM_TYPES) {}
-      bool operator==(IndexVectors& other) { return other.cpuVectors_ == cpuVectors_; }
-
-      CpuIndexVector& indexVector(Type t) { return cpuVectors_[t]; }
-      IndexInfo* pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
-
-    private:
-      std::vector< CpuIndexVector > cpuVectors_;
-   };
-
-   contact(const shared_ptr< StructuredBlockForest >& blocks, BlockDataID phaseFieldID_, double alpha)
-      : phaseFieldID(phaseFieldID_), alpha_(alpha)
-   {
-      auto createIdxVector = [](IBlock* const, StructuredBlockStorage* const) { return new IndexVectors(); };
-      indexVectorID = blocks->addStructuredBlockData< IndexVectors >(createIdxVector, "IndexField_contact_angle");
-   };
-
-   void operator()(IBlock* block);
-   void inner(IBlock* block);
-   void outer(IBlock* block);
-
-   template< typename NormalField_T >
-   void fillFromNormalField(const shared_ptr< StructuredBlockForest >& blocks, ConstBlockDataID normalFieldID)
-   {
-      for (auto& block : *blocks)
-      {
-         auto* indexVectors     = block.getData< IndexVectors >(indexVectorID);
-         auto& indexVectorAll   = indexVectors->indexVector(IndexVectors::ALL);
-         auto& indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
-         auto& indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
-
-         auto* normalField = block.getData< NormalField_T >(normalFieldID);
-
-         auto inner = normalField->xyzSize();
-         inner.expand(cell_idx_t(-1));
-
-         indexVectorAll.clear();
-         indexVectorInner.clear();
-         indexVectorOuter.clear();
-         // clang-format off
-         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(normalField, Cell globalCell;
-            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-            if(normalField->get(x, y, z, 0) != 0 || normalField->get(x, y, z, 1) != 0 || normalField->get(x, y, z, 2) != 0)
-               {
-                  auto element = IndexInfo(x, y,  z, normalField->get(x, y, z, 0), normalField->get(x, y, z, 1), normalField->get(x, y, z, 2));
-                  indexVectorAll.push_back( element );
-                  if( inner.contains( x, y, z ) )
-                      indexVectorInner.push_back( element );
-                  else
-                     indexVectorOuter.push_back( element );
-               }
-         )
-         // clang-format on
-      }
-   }
-
- private:
-   void run(IBlock* block, IndexVectors::Type type);
-
-   BlockDataID indexVectorID;
-
- public:
-   BlockDataID phaseFieldID;
-   double alpha_;
-};
-
-} // namespace lbm
-} // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py b/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
index 68229ece6fda45f0467a6277548ac23515054a8b..ee297075c1030f87b1d4b3cab4ee505f07dc381b 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
@@ -7,7 +7,7 @@ class Scenario:
         self.vtkWriteFrequency = 1000
 
         # simulation parameters
-        self.timesteps = 5001
+        self.timesteps = 10001
         self.cells = (32, 64, 32)
         self.blocks = (4, 1, 4)
         self.periodic = (0, 0, 0)
@@ -18,25 +18,24 @@ class Scenario:
         self.overlappingWidth = (8, 1, 1)
         self.timeStepStrategy = 'normal'
 
-        self.contactAngle = 22
-
         # bubble parameters
         self.dropletRadius = 24.0
         self.dropletMidPoint = (64, 24, 64)
 
         # everything else
-        self.scenario = 1  # 1 rising bubble, 2 RTI
+        self.scenario = 1  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 Taylor bubble setup
 
         self.counter = 0
         self.yPositions = []
 
     @wlb.member_callback
-    def config(self, **kwargs):
+    def config(self):
         return {
             'DomainSetup': {
                 'blocks': self.blocks,
                 'cellsPerBlock': self.cells,
                 'periodic': self.periodic,
+                'tube': False
             },
             'Parameters': {
                 'timesteps': self.timesteps,
@@ -45,7 +44,6 @@ class Scenario:
                 'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
-                'contactAngle': self.contactAngle
             },
             'PhysicalParameters': {
                 'density_liquid': 1.0,
@@ -55,6 +53,7 @@ class Scenario:
                 'gravitational_acceleration': 0.0,
                 'relaxation_time_liquid': 3 * 0.166,
                 'relaxation_time_gas': 3 * 0.0166,
+                'interface_thickness': 5
             },
             'Boundaries': {
                 'Border': [
@@ -69,7 +68,7 @@ class Scenario:
             'Bubble': {
                 'bubbleMidPoint': self.dropletMidPoint,
                 'bubbleRadius': self.dropletRadius,
-                'bubble': False
+                'bubble': False  # this means we are simulating a droplet rather than a bubble
             },
         }
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
index 00f5fae4d7caad2a9540aade589e05ea9aa06cc3..07c9542cf3c83de774e3cf39f8cfafed8b9ec597 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
@@ -27,21 +27,17 @@
 
 #include "field/AddToStorage.h"
 #include "field/FlagField.h"
-#include "field/communication/PackInfo.h"
 #include "field/vtk/VTKWriter.h"
 
 #include "geometry/InitBoundaryHandling.h"
 
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
-#include "python_coupling/export/FieldExports.h"
 
 #include "timeloop/SweepTimeloop.h"
 
-#include "CalculateNormals.h"
 #include "InitializerFunctions.h"
 #include "PythonExports.h"
-#include "contact.h"
 
 //////////////////////////////
 // INCLUDE GENERATED FILES //
@@ -54,7 +50,10 @@
 #include "initialize_velocity_based_distributions.h"
 #include "phase_field_LB_NoSlip.h"
 #include "phase_field_LB_step.h"
-#include "stream_hydro.h"
+#include "ContactAngle.h"
+#include "PackInfo_phase_field_distributions.h"
+#include "PackInfo_velocity_based_distributions.h"
+#include "PackInfo_phase_field.h"
 
 ////////////
 // USING //
@@ -62,11 +61,6 @@
 
 using namespace walberla;
 
-using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
-using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
-using VelocityField_T  = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
-using NormalsField_T   = GhostLayerField< int8_t, Stencil_hydro_T::Dimension >;
-using PhaseField_T     = GhostLayerField< real_t, 1 >;
 using flag_t           = walberla::uint8_t;
 using FlagField_T      = FlagField< flag_t >;
 
@@ -89,6 +83,7 @@ int main(int argc, char** argv)
 
       auto domainSetup                = config->getOneBlock("DomainSetup");
       Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+      const bool tube                 = domainSetup.getParameter< bool >("tube", true);
 
       ////////////////////////////////////////
       // ADD GENERAL SIMULATION PARAMETERS //
@@ -100,8 +95,7 @@ int main(int argc, char** argv)
       const real_t remainingTimeLoggerFrequency =
          parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
       const uint_t scenario  = parameters.getParameter< uint_t >("scenario", uint_c(1));
-      const real_t alpha     = parameters.getParameter< real_t >("contactAngle", real_c(90));
-      const real_t alpha_rad = alpha * (math::pi / 180);
+
       Vector3< int > overlappingWidth =
          parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
 
@@ -116,9 +110,19 @@ int main(int argc, char** argv)
       BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_t(0), field::fzyx);
 
-      BlockDataID normals     = field::addToStorage< NormalsField_T >(blocks, "normals", int8_t(0), field::fzyx);
       BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
 
+      auto physical_parameters     = config->getOneBlock("PhysicalParameters");
+      const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
+      const real_t density_gas     = physical_parameters.getParameter< real_t >("density_gas");
+      const real_t surface_tension = physical_parameters.getParameter< real_t >("surface_tension");
+      const real_t mobility        = physical_parameters.getParameter< real_t >("mobility");
+      const real_t gravitational_acceleration =
+         physical_parameters.getParameter< real_t >("gravitational_acceleration");
+      const real_t relaxation_time_liquid = physical_parameters.getParameter< real_t >("relaxation_time_liquid");
+      const real_t relaxation_time_gas    = physical_parameters.getParameter< real_t >("relaxation_time_gas");
+      const real_t interface_thickness    = physical_parameters.getParameter< real_t >("interface_thickness");
+
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the phase-field")
       if (scenario == 1)
       {
@@ -130,7 +134,7 @@ int main(int argc, char** argv)
       }
       else if (scenario == 2)
       {
-         initPhaseField_RTI(blocks, phase_field);
+         initPhaseField_RTI(blocks, phase_field, interface_thickness, tube);
       }
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the phase-field done")
 
@@ -138,43 +142,31 @@ int main(int argc, char** argv)
       // ADD SWEEPS //
       ///////////////
 
-      auto physical_parameters     = config->getOneBlock("PhysicalParameters");
-      const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
-      const real_t density_gas     = physical_parameters.getParameter< real_t >("density_gas");
-      const real_t surface_tension = physical_parameters.getParameter< real_t >("surface_tension");
-      const real_t mobility        = physical_parameters.getParameter< real_t >("mobility");
-      const real_t gravitational_acceleration =
-         physical_parameters.getParameter< real_t >("gravitational_acceleration");
-      const real_t relaxation_time_liquid = physical_parameters.getParameter< real_t >("relaxation_time_liquid");
-      const real_t relaxation_time_gas    = physical_parameters.getParameter< real_t >("relaxation_time_gas");
-
-      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
+      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field, interface_thickness);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
 
       pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field, phase_field, vel_field, mobility,
+         lb_phase_field, phase_field, vel_field, interface_thickness, mobility,
          Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
       pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field, gravitational_acceleration,
-                                              density_liquid, density_gas, surface_tension, relaxation_time_liquid,
+                                              interface_thickness, density_liquid, density_gas, surface_tension, relaxation_time_liquid,
                                               relaxation_time_gas,
                                               Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
-      pystencils::stream_hydro stream_hydro(lb_velocity_field,
-                                            Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
 
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
-
       blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field(blocks);
-      Comm_phase_field.addPackInfo(make_shared< field::communication::PackInfo< PhaseField_T > >(phase_field));
+      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
+      Comm_phase_field.addPackInfo(generatedPackInfo_phase_field);
 
       blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
-      Comm_velocity_based_distributions.addPackInfo(
-         make_shared< field::communication::PackInfo< PdfField_hydro_T > >(lb_velocity_field));
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
+      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
 
       blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
-      Comm_phase_field_distributions.addPackInfo(
-         make_shared< field::communication::PackInfo< PdfField_phase_T > >(lb_phase_field));
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
+      Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
 
       ////////////////////////
       // BOUNDARY HANDLING //
@@ -190,14 +182,13 @@ int main(int argc, char** argv)
          geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
       }
 
-      calculate_normals(blocks, normals, flagFieldID, fluidFlagUID, wallFlagUID);
       lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, lb_phase_field);
       lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, lb_velocity_field);
-      lbm::contact contact_angle(blocks, phase_field, alpha_rad);
+      pystencils::ContactAngle contact_angle(blocks, phase_field, interface_thickness);
 
       phase_field_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
       hydro_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
-      contact_angle.fillFromNormalField< NormalsField_T >(blocks, normals);
+      contact_angle.fillFromFlagField< FlagField_T >(blocks, flagFieldID, wallFlagUID, fluidFlagUID);
 
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the normals-field done")
 
@@ -209,48 +200,44 @@ int main(int argc, char** argv)
       auto normalTimeStep = [&]() {
          for (auto& block : *blocks)
          {
-            phase_field_LB_NoSlip(&block);
             Comm_phase_field_distributions();
+            phase_field_LB_NoSlip(&block);
 
-            phase_field_LB_step(&block);
 
-            Comm_phase_field();
+            phase_field_LB_step(&block);
             contact_angle(&block);
+            Comm_phase_field();
+
 
             hydro_LB_step(&block);
-            hydro_LB_NoSlip(&block);
-            Comm_velocity_based_distributions();
 
-            stream_hydro(&block);
+            Comm_velocity_based_distributions();
+            hydro_LB_NoSlip(&block);
          }
       };
       auto simpleOverlapTimeStep = [&]() {
-         for (auto& block : *blocks)
-            phase_field_LB_NoSlip(&block);
-
-         Comm_phase_field_distributions.startCommunication();
-         for (auto& block : *blocks)
-            phase_field_LB_step.inner(&block);
-         Comm_phase_field_distributions.wait();
-         for (auto& block : *blocks)
-            phase_field_LB_step.outer(&block);
-
-         Comm_phase_field.startCommunication();
-         for (auto& block : *blocks)
-            hydro_LB_step.inner(&block);
-         Comm_phase_field.wait();
-         for (auto& block : *blocks)
-            hydro_LB_step.outer(&block);
-
-         for (auto& block : *blocks)
-            hydro_LB_NoSlip(&block);
-
-         Comm_velocity_based_distributions.startCommunication();
-         for (auto& block : *blocks)
-            stream_hydro.inner(&block);
-         Comm_velocity_based_distributions.wait();
-         for (auto& block : *blocks)
-            stream_hydro.outer(&block);
+        for (auto& block : *blocks)
+           phase_field_LB_NoSlip(&block);
+        // exchange the phase-field PDFs while the inner cells are being updated
+        Comm_phase_field_distributions.startCommunication();
+        for (auto& block : *blocks)
+           phase_field_LB_step.inner(&block);
+        Comm_phase_field_distributions.wait();
+        for (auto& block : *blocks)
+           phase_field_LB_step.outer(&block);
+
+        for (auto& block : *blocks)
+           contact_angle(&block);
+        // exchange the phase field while the inner cells of the hydrodynamic step are being updated
+        Comm_phase_field.startCommunication();
+        for (auto& block : *blocks)
+           hydro_LB_step.inner(&block);
+        Comm_phase_field.wait();
+        for (auto& block : *blocks)
+           hydro_LB_step.outer(&block);
+        Comm_velocity_based_distributions(); // communicate the hydrodynamic PDFs, mirroring normalTimeStep
+        for (auto& block : *blocks)
+           hydro_LB_NoSlip(&block);
       };
       std::function< void() > timeStep;
       if (timeStepStrategy == "overlap")
@@ -294,6 +281,8 @@ int main(int argc, char** argv)
                {
                   callback.data().exposeValue("blocks", blocks);
                   callback.data().exposeValue( "timeStep", timeLoop->getCurrentTimeStep());
+                  callback.data().exposeValue("stencil_phase", stencil_phase_name);
+                  callback.data().exposeValue("stencil_hydro", stencil_hydro_name);
                   callback();
                }
             }
@@ -313,9 +302,6 @@ int main(int argc, char** argv)
          auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "PhaseField");
          vtkOutput->addCellDataWriter(phaseWriter);
 
-         // auto normlasWriter = make_shared<field::VTKWriter<NormalsField_T>>(normals, "Normals");
-         // vtkOutput->addCellDataWriter(normlasWriter);
-
          // auto velWriter = make_shared<field::VTKWriter<VelocityField_T>>(vel_field, "Velocity");
          // vtkOutput->addCellDataWriter(velWriter);
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
index 1ef86f585a2ad8e90ae0400321284431eaa71b04..401ce693ac893aba8375c64dd2b56f6853a9c08c 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
@@ -1,8 +1,11 @@
+import os
+
 import waLBerla as wlb
-import waLBerla.tools.sqlitedb as wlbSqlite
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
 from waLBerla.core_extension import makeSlice
 
 import numpy as np
+import pandas as pd
 from lbmpy.phasefield_allen_cahn.parameter_calculation import calculate_parameters_rti
 
 
@@ -10,12 +13,11 @@ class Scenario:
     def __init__(self):
         # output frequencies
         self.vtkWriteFrequency = 1000
-        self.dbWriteFrequency = 200
 
         # simulation parameters
-        self.timesteps = 27001
+        self.time = 2  # physical time in seconds
 
-        self.cells = (64, 32, 64)
+        self.cells = (128, 64, 128)
         self.blocks = (1, 8, 1)
         self.periodic = (1, 0, 1)
         self.size = (self.cells[0] * self.blocks[0],
@@ -24,25 +26,40 @@ class Scenario:
 
         # physical parameters
         self.density_heavy = 1.0
-        self.reference_time = 6000
-        self.parameters = calculate_parameters_rti(reference_length=128,
+        self.reference_time = 4000
+        self.dbWriteFrequency = self.reference_time // 20
+        self.timesteps = int(self.reference_time * self.time) + 1
+
+        self.capillary_number = 8.7
+        self.reynolds_number = 3000
+        self.atwood_number = 1
+        self.peclet_number = 744
+        self.density_ratio = 1000
+        self.viscosity_ratio = 100
+
+        self.parameters = calculate_parameters_rti(reference_length=self.cells[0],
                                                    reference_time=self.reference_time,
                                                    density_heavy=self.density_heavy,
-                                                   capillary_number=9.1,
-                                                   reynolds_number=128,
-                                                   atwood_number=1.0,
-                                                   peclet_number=140,
-                                                   density_ratio=3,
-                                                   viscosity_ratio=3)
+                                                   capillary_number=self.capillary_number,
+                                                   reynolds_number=self.reynolds_number,
+                                                   atwood_number=self.atwood_number,
+                                                   peclet_number=self.peclet_number,
+                                                   density_ratio=self.density_ratio,
+                                                   viscosity_ratio=self.viscosity_ratio)
+
+        self.interface_thickness = 5
+        self.tube = False
 
         # everything else
-        self.dbFile = "RTI.db"
+        self.dbFile = "RTI.csv"
 
-        self.scenario = 2  # 1 rising bubble, 2 RTI
+        self.scenario = 2  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 Taylor bubble setup
 
         self.counter = 0
         self.yPositions = []
 
+        self.config_dict = self.config()
+
     @wlb.member_callback
     def config(self):
         return {
@@ -50,23 +67,24 @@ class Scenario:
                 'blocks': self.blocks,
                 'cellsPerBlock': self.cells,
                 'periodic': self.periodic,
+                'tube': self.tube
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
                 'dbWriteFrequency': self.dbWriteFrequency,
-                'useGui': 0,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
             },
             'PhysicalParameters': {
                 'density_liquid': self.density_heavy,
-                'density_gas': self.parameters["density_light"],
-                'surface_tension': self.parameters["surface_tension"],
-                'mobility': self.parameters.get("mobility", 0.1),
-                'gravitational_acceleration': self.parameters["gravitational_acceleration"],
+                'density_gas': self.parameters.get("density_light"),
+                'surface_tension': self.parameters.get("surface_tension"),
+                'mobility': self.parameters.get("mobility"),
+                'gravitational_acceleration': self.parameters.get("gravitational_acceleration"),
                 'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
                 'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
                 'Border': [
@@ -79,55 +97,101 @@ class Scenario:
     @wlb.member_callback
     def at_end_of_time_step(self, blocks, **kwargs):
+        """Append one row of simulation diagnostics to a CSV file every ``dbWriteFrequency`` steps.
+
+        The whole phase field is gathered; the row is only assembled and written
+        where the gather returns a non-empty result.
+        """
         t = kwargs['timeStep']
-        ny = self.size[1]
-        l0 = self.size[0]
         if t % self.dbWriteFrequency == 0:
-            location_of_spike = -100
-            location_of_bubble = -100
-            location_of_saddle = -100
-            mass = -100
-            spike_data = wlb.field.gather(blocks, 'phase', makeSlice[self.size[0] // 2, :, self.size[2] // 2])
-            if spike_data:
-                spike_field = np.asarray(spike_data).squeeze()
-                location_of_spike = (np.argmax(spike_field > 0.5) - ny // 2) / l0
-
-            bubble_data = wlb.field.gather(blocks, 'phase', makeSlice[0, :, 0])
-            if bubble_data:
-                bubble_field = np.asarray(bubble_data).squeeze()
-                location_of_bubble = (np.argmax(bubble_field > 0.5) - ny // 2) / l0
-
-            saddle_data = wlb.field.gather(blocks, 'phase', makeSlice[0, :, self.size[2] // 2])
-            if saddle_data:
-                saddle_field = np.asarray(saddle_data).squeeze()
-                location_of_saddle = (np.argmax(saddle_field > 0.5) - ny // 2) / l0
-
             phase = wlb.field.gather(blocks, 'phase', makeSlice[:, :, :])
             if phase:
+                data = {'timestep': t}
+                data.update(self.config_dict['PhysicalParameters'])
+                data.update({'total_timesteps': self.timesteps})
+                data.update({'normalized_time': t / self.reference_time})
+                data.update({'tube_setup': self.tube})
+                data.update({'interface_thickness': self.interface_thickness})
+                data.update({'capillary_number': self.capillary_number})
+                data.update({'reynolds_number': self.reynolds_number})
+                data.update({'atwood_number': self.atwood_number})
+                data.update({'peclet_number': self.peclet_number})
+                data.update({'density_ratio': self.density_ratio})
+                data.update({'viscosity_ratio': self.viscosity_ratio})
+                data.update({'reference_time': self.reference_time})
+                data.update(kwargs)
+
                 phase_field = np.asarray(phase).squeeze()
                 mass = np.sum(phase_field)
+                # reuse the sum: a NaN/inf anywhere in the field makes the total non-finite
+                stable = np.isfinite(mass)
+                rel_max = np.max(phase_field) - 1
+                rel_min = np.min(phase_field)
+                data.update({'mass': mass})
+                data.update({'rel_max': rel_max})
+                data.update({'rel_min': rel_min})
+                data.update({'stable': stable})
+
+                # the spike is tracked along the central y-profile in both set-ups
+                location_of_spike = self.get_interface_location(
+                    phase_field[self.size[0] // 2, :, self.size[2] // 2])
+                data.update({'location_of_spike': location_of_spike})
+
+                if self.tube:
+                    # inspect a 20-cell window around the cell below 0.5 that has the largest y index
+                    light_cells = np.where(phase_field < 0.5)
+                    location_of_bubble = -100  # same sentinel as get_interface_location
+                    if light_cells[1].size > 0:
+                        top = np.argmax(light_cells[1])
+                        y_top = light_cells[1][top]
+                        # clamp the start: a negative slice start would wrap around to the array end
+                        y_start = max(y_top - 10, 0)
+                        location_of_bubble = self.get_interface_location(
+                            phase_field[light_cells[0][top], y_start:y_top + 10, light_cells[2][top]], y_start)
+
+                    data.update({'location_of_bubble': location_of_bubble})
+                else:
+                    location_of_bubble = self.get_interface_location(phase_field[0, :, 0])
+                    location_of_saddle = self.get_interface_location(phase_field[0, :, self.size[2] // 2])
+
+                    data.update({'location_of_bubble': location_of_bubble})
+                    data.update({'location_of_saddle': location_of_saddle})
+
+                sequenceValuesToScalars(data)
+
+                # NOTE(review): stencil names are expected in kwargs; the "_tube" suffix is added unconditionally
+                csv_file = f"RTI_{data['stencil_phase']}_{data['stencil_hydro']}_Re_{self.reynolds_number}_tube.csv"
+
+                df = pd.DataFrame.from_records([data])
+                if not os.path.isfile(csv_file):
+                    df.to_csv(csv_file, index=False)
+                else:
+                    df.to_csv(csv_file, index=False, mode='a', header=False)
+
+    def get_interface_location(self, one_dimensional_array, shift=None):
+        """Return the normalised position of the 0.5 phase-field crossing along a 1D profile.
+
+        The first entry above 0.5 is located and the crossing is linearly interpolated
+        between that entry and its predecessor.
+
+        :param one_dimensional_array: phase-field values along the y direction
+        :param shift: optional index offset added to the crossing when only a window of the profile is passed
+        :return: ``(crossing - ny // 2) / l0`` with ``ny = self.size[1]`` and ``l0 = self.size[0]``,
+                 or ``-100`` when no entry is above 0.5 or the very first entry already is
+        """
+        ny = self.size[1]
+        l0 = self.size[0]
 
-            self.write_result_to_database(t, location_of_spike, location_of_bubble, location_of_saddle, mass)
-
-    def write_result_to_database(self, t, spike, bubble, saddle, mass):
-        """Writes the simulation result stored in the global variables shapeFactors and angles to
-               an sqlite3 database, and resets the global variables."""
-        result = {'waLBerlaVersion': wlb.build_info.version,
-                  'xCells': self.size[0],
-                  'yCells': self.size[1],
-                  'zCells': self.size[2],
-                  'spike': spike,
-                  'bubble': bubble,
-                  'saddle': saddle,
-                  'mass': mass,
-                  'timesteps': t,
-                  'normalized_time': t / self.reference_time,
-                  'processes': wlb.mpi.numProcesses(),
-                  }
-        try:
-            wlbSqlite.checkAndUpdateSchema(result, 'interface_location', self.dbFile)
-            wlbSqlite.storeSingle(result, 'interface_location', self.dbFile)
-        except Exception as e:
-            wlb.log_warning("Failed to store run in database " + str(e) + "\n" + str(result))
+        index = np.argmax(one_dimensional_array > 0.5)
+
+        if index > 0:
+            zw1 = one_dimensional_array[index]
+            zw2 = one_dimensional_array[index - 1]
+            absolute_location = (index - 1) + (zw2 - 0.5) / (zw2 - zw1)
+            if shift:
+                absolute_location += shift
+            return (absolute_location - ny // 2) / l0
+        else:
+            return -100
 
 
 scenarios = wlb.ScenarioManager()
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
index 28f40c5e4a68f477fcd2d8137fa270ca07f0f577..607fc161686c9bec98be1a103d6c70a47e6d0095 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
@@ -1,26 +1,31 @@
+from lbmpy.phasefield_allen_cahn.contact_angle import ContactAngle
 from pystencils import fields
 from pystencils.simp import sympy_cse
-from pystencils import AssignmentCollection
 
 from lbmpy.boundaries import NoSlip
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_method
 from lbmpy.stencils import get_stencil
 
-from pystencils_walberla import CodeGeneration, generate_sweep
-from lbmpy_walberla import generate_boundary
+import pystencils_walberla
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
+from lbmpy_walberla import generate_boundary, generate_lb_pack_info
 
 from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
-    initializer_kernel_hydro_lb, interface_tracking_force, \
-    hydrodynamic_force, get_collision_assignments_hydro
+    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, get_collision_assignments_hydro, \
+    get_collision_assignments_phase
 
 from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
 
 import numpy as np
 import sympy as sp
-import pystencils as ps
 
-stencil_phase = get_stencil("D3Q19")
-stencil_hydro = get_stencil("D3Q27")
+stencil_phase_name = "D3Q27"
+stencil_hydro_name = "D3Q27"
+
+contact_angle_in_degrees = 22
+
+stencil_phase = get_stencil(stencil_phase_name)
+stencil_hydro = get_stencil(stencil_hydro_name)
 q_phase = len(stencil_phase)
 q_hydro = len(stencil_hydro)
 
@@ -45,7 +50,7 @@ relaxation_time_gas = sp.Symbol("tau_L")
 # phase-field parameter
 drho3 = (density_liquid - density_gas) / 3
 # interface thickness
-W = 5
+W = sp.Symbol("interface_thickness")
 # coefficients related to surface tension
 beta = 12.0 * (surface_tension / W)
 kappa = 1.5 * surface_tension * W
@@ -58,6 +63,7 @@ kappa = 1.5 * surface_tension * W
 u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
 # phase-field
 C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
+C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
 
 flag = fields(f"flag_field: uint8[{dimensions}D]", layout='fzyx')
 # phase-field distribution functions
@@ -93,12 +99,17 @@ relaxation_rate_cutoff = sp.Piecewise((1 / (0.5 + relaxation_time_liquid), C.cen
 # LBM METHODS #
 ###############
 
-method_phase = create_lb_method(stencil=stencil_phase, method='srt',
-                                relaxation_rate=relaxation_rate_allen_cahn, compressible=True)
+# method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
+#                                 relaxation_rates=[1, 1.5, 1, 1.5, 1, 1.5])
+method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
+                                relaxation_rates=[1, 1, 1, 1, 1, 1])
+
+method_phase.set_conserved_moments_relaxation_rate(relaxation_rate_allen_cahn)
 
 method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
                                 relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1])
 
+
 # create the kernels for the initialization of the g and h field
 h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W, fd_stencil=get_stencil("D3Q27"))
 g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
@@ -109,84 +120,84 @@ force_model_h = MultiphaseForceModel(force=force_h)
 force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
                              fd_stencil=get_stencil("D3Q27"))
 
+force_model_g = MultiphaseForceModel(force=force_g, rho=density)
+
 ####################
 # LBM UPDATE RULES #
 ####################
 
-h_tmp_symbol_list = [h_tmp.center(i) for i, _ in enumerate(stencil_phase)]
-sum_h = np.sum(h_tmp_symbol_list[:])
+phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
+                                                      velocity_input=u,
+                                                      output={'density': C_tmp},
+                                                      force_model=force_model_h,
+                                                      symbolic_fields={"symbolic_field": h,
+                                                                       "symbolic_temporary_field": h_tmp},
+                                                      kernel_type='stream_pull_collide')
 
-method_phase.set_force_model(force_model_h)
-
-phase_field_LB_step = create_lb_update_rule(lb_method=method_phase,
-                                            velocity_input=u,
-                                            compressible=True,
-                                            optimization={"symbolic_field": h,
-                                                          "symbolic_temporary_field": h_tmp},
-                                            kernel_type='stream_pull_collide')
-
-phase_field_LB_step.set_main_assignments_from_dict({**phase_field_LB_step.main_assignments_dict, **{C.center: sum_h}})
-subexp = [ps.Assignment(a.lhs, float(a.rhs)) if a.rhs == 0 else a for a in phase_field_LB_step.subexpressions]
-phase_field_LB_step = AssignmentCollection(main_assignments=phase_field_LB_step.main_assignments,
-                                           subexpressions=subexp)
 phase_field_LB_step = sympy_cse(phase_field_LB_step)
 
-# ---------------------------------------------------------------------------------------------------------
 hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
                                                 density=density,
                                                 velocity_input=u,
-                                                force=force_g,
+                                                force_model=force_model_g,
                                                 sub_iterations=2,
                                                 symbolic_fields={"symbolic_field": g,
                                                                  "symbolic_temporary_field": g_tmp},
-                                                kernel_type='collide_only')
+                                                kernel_type='collide_stream_push')
 
 hydro_LB_step.set_sub_expressions_from_dict({**{relaxation_rate: relaxation_rate_cutoff},
                                              **hydro_LB_step.subexpressions_dict})
 
-stream_hydro = create_lb_update_rule(stencil=stencil_hydro,
-                                     optimization={"symbolic_field": g,
-                                                   "symbolic_temporary_field": g_tmp},
-                                     kernel_type='stream_pull_only')
+contact_angle = ContactAngle(contact_angle_in_degrees, W)
+
 
 ###################
 # GENERATE SWEEPS #
 ###################
-cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': True}
-
-vp = [('int32_t', 'cudaBlockSize0'),
-      ('int32_t', 'cudaBlockSize1')]
-
 info_header = f"""
+using namespace walberla;
 #include "stencil/D3Q{q_phase}.h"\nusing Stencil_phase_T = walberla::stencil::D3Q{q_phase};
 #include "stencil/D3Q{q_hydro}.h"\nusing Stencil_hydro_T = walberla::stencil::D3Q{q_hydro};
+using PdfField_phase_T = GhostLayerField<real_t, {q_phase}>;
+using PdfField_hydro_T = GhostLayerField<real_t, {q_hydro}>;
+using VelocityField_T = GhostLayerField<real_t, {dimensions}>;
+using PhaseField_T = GhostLayerField<real_t, 1>;
+#ifndef UTIL_H
+#define UTIL_H
+const char * stencil_phase_name = "{stencil_phase_name}";
+const char * stencil_hydro_name = "{stencil_hydro_name}";
+#endif
 """
 
 with CodeGeneration() as ctx:
-    if not ctx.optimize_for_localhost:
-        cpu_vec = {'instruction_set': None}
-
     generate_sweep(ctx, 'initialize_phase_field_distributions', h_updates, target='cpu')
     generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target='cpu')
 
     generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                   field_swaps=[(h, h_tmp)],
+                   field_swaps=[(h, h_tmp), (C, C_tmp)],
                    inner_outer_split=True,
-                   cpu_vectorize_info=cpu_vec, target='cpu')
-    generate_boundary(ctx, 'phase_field_LB_NoSlip', NoSlip(), method_phase, target='cpu')
+                   target='cpu')
+    generate_boundary(ctx, 'phase_field_LB_NoSlip', NoSlip(), method_phase, target='cpu', streaming_pattern='pull')
 
     generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
-                   inner_outer_split=True,
-                   cpu_vectorize_info=cpu_vec, target='cpu')
-    generate_boundary(ctx, 'hydro_LB_NoSlip', NoSlip(), method_hydro, target='cpu')
-
-    generate_sweep(ctx, 'stream_hydro', stream_hydro,
                    field_swaps=[(g, g_tmp)],
                    inner_outer_split=True,
-                   cpu_vectorize_info=cpu_vec, target='cpu')
+                   target='cpu')
+    generate_boundary(ctx, 'hydro_LB_NoSlip', NoSlip(), method_hydro, target='cpu', streaming_pattern='push')
 
-    ctx.write_file("GenDefines.h", info_header)
+    # communication
+
+    generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                          streaming_pattern='pull', target='cpu')
 
-    # TODO: generate communication (PackInfo)
+    generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                          streaming_pattern='push', target='cpu')
+
+    generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='cpu')
+
+    pystencils_walberla.boundary.generate_boundary(ctx, 'ContactAngle', contact_angle,
+                                                   C.name, stencil_hydro, index_shape=[], target='cpu')
+
+    ctx.write_file("GenDefines.h", info_header)
 
 print("finished code generation successfully")
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
index d424d7c21c5255ec85ed28e493f54176d12ff4be..c040fc2f1df72793085b745787bd9dff6d00b9a5 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
@@ -14,8 +14,8 @@ class Scenario:
 
         # simulation parameters
         self.timesteps = 10000
-        self.cells = (64, 32, 64)
-        self.blocks = (1, 4, 1)
+        self.cells = (32, 32, 64)
+        self.blocks = (2, 4, 1)
         self.periodic = (0, 0, 0)
         self.size = (self.cells[0] * self.blocks[0],
                      self.cells[1] * self.blocks[1],
@@ -39,6 +39,8 @@ class Scenario:
                                                                 density_ratio=1000,
                                                                 viscosity_ratio=100)
 
+        self.interface_thickness = 5
+
         # everything else
         self.dbFile = "risingBubble3D.db"
 
@@ -72,6 +74,7 @@ class Scenario:
                 'gravitational_acceleration': self.parameters["gravitational_acceleration"],
                 'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
                 'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
                 'Border': [
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt
index 51c6922032a44744f513890ac6642185930ba3af..a8ff39721f322b6bdd5deed047984176ed570c15 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/CMakeLists.txt
@@ -10,14 +10,14 @@ waLBerla_generate_target_from_python(NAME PhaseFieldCodeGenGPU
         phase_field_LB_NoSlip.cu phase_field_LB_NoSlip.h
         hydro_LB_step.cu hydro_LB_step.h
         hydro_LB_NoSlip.cu hydro_LB_NoSlip.h
-        stream_hydro.cu stream_hydro.h
         PackInfo_phase_field_distributions.cu PackInfo_phase_field_distributions.h
         PackInfo_phase_field.cu PackInfo_phase_field.h
         PackInfo_velocity_based_distributions.cu PackInfo_velocity_based_distributions.h
+        ContactAngle.cu ContactAngle.h
         GenDefines.h)
 
 waLBerla_add_executable(NAME multiphaseGPU
-        FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp CalculateNormals.cpp contact.cu multiphase_codegen.py
+        FILES multiphase.cpp PythonExports.cpp InitializerFunctions.cpp util.cpp multiphase_codegen.py
         DEPENDS blockforest core cuda field postprocessing lbm geometry timeloop gui PhaseFieldCodeGenGPU)
 
 set_target_properties(multiphaseGPU PROPERTIES CXX_VISIBILITY_PRESET hidden)
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.cpp
deleted file mode 100644
index e20053e38130262fdf2bb8c556f1fd665888a945..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file CalculateNormals.cpp
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-#include "CalculateNormals.h"
-
-#include "core/Environment.h"
-#include "core/logging/Initialization.h"
-
-#include "field/FlagField.h"
-
-namespace walberla
-{
-using FlagField_T    = FlagField< uint8_t >;
-using NormalsField_T = GhostLayerField< int8_t, 3 >;
-
-void calculate_normals(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID normalsFieldID,
-                       ConstBlockDataID flagFieldID, FlagUID domainFlagUID, FlagUID boundaryFlagUID)
-{
-   for (auto& block : *blocks)
-   {
-      CellInterval globalCellBB = blocks->getBlockCellBB(block);
-      CellInterval blockLocalCellBB;
-      blocks->transformGlobalToBlockLocalCellInterval(blockLocalCellBB, block, globalCellBB);
-
-      auto* normalsField = block.getData< NormalsField_T >(normalsFieldID);
-      auto* flagField    = block.getData< FlagField_T >(flagFieldID);
-      auto boundaryFlag  = flagField->getFlag(boundaryFlagUID);
-      auto domainFlag    = flagField->getFlag(domainFlagUID);
-
-      // clang-format off
-      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(normalsField,
-
-         if( x < blockLocalCellBB.xMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x + 1, y, z) == domainFlag)
-               normalsField->get(x, y, z, 0) = 1;
-         }
-
-         if( x > blockLocalCellBB.xMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x - 1, y, z) == domainFlag)
-               normalsField->get(x, y, z, 0) = - 1;
-         }
-
-         if( y < blockLocalCellBB.yMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y + 1, z) == domainFlag)
-               normalsField->get(x, y, z, 1) = 1;
-         }
-
-         if( y > blockLocalCellBB.yMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y - 1, z) == domainFlag)
-               normalsField->get(x, y, z, 1) = - 1;
-         }
-
-         if( z < blockLocalCellBB.zMax() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y, z + 1) == domainFlag)
-               normalsField->get(x, y, z, 2) = 1;
-         }
-
-         if( z > blockLocalCellBB.zMin() ){
-            if(flagField->get(x, y, z) == boundaryFlag && flagField->get(x, y, z - 1) == domainFlag)
-               normalsField->get(x, y, z, 2) = - 1;
-         }
-
-      )
-      // clang-format on
-   }
-}
-} // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.cpp
index 0fd64cb73cefcfefb25a565fee9c80b4c127b155..979c87ff330f669567df87c2e051a75271280b6b 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.cpp
@@ -53,24 +53,196 @@ void initPhaseField_sphere(const shared_ptr< StructuredBlockStorage >& blocks, B
    }
 }
 
+void init_Taylor_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
+                        const real_t D = 5, const real_t H = 2, const real_t DT = 20, const real_t Donut_x0 = 40)
+{
+   auto Mx = blocks->getDomainCellBB().xMax() / 2.0;
+   auto Mz = blocks->getDomainCellBB().zMax() / 2.0;
+
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+         phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+
+         real_t Ri = D * sqrt(pow(H, 2) - pow(DT - sqrt(pow(globalCell[0] - Mx, 2) + pow(globalCell[2] - Mz, 2)), 2));
+
+         real_t shifter           = atan2((globalCell[2] - Mz), (globalCell[0] - Mx));
+         if (shifter < 0) shifter = shifter + 2 * math::pi;
+         if ((globalCell[1] < Donut_x0 + Ri * sin(shifter / 2.0)) && (globalCell[1] > Donut_x0 - Ri)) {
+            phaseField->get(x, y, z) = 0.0;
+         } else { phaseField->get(x, y, z) = 1.0; })
+   }
+}
+
+// Places an nx x nz array of bubbles (radius and x/z position randomly perturbed, centre height R + W + 4) and a
+// gas layer near the top of the domain. W is the interface width used in the tanh profiles. bubbleMidPoint is
+// declared inside the cell loop so that every iteration works on its own copy.
+void init_bubble_field(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
+                       real_t W = 5)
+{
+   auto X = blocks->getDomainCellBB().xMax();
+   auto Y = blocks->getDomainCellBB().yMax();
+   auto Z = blocks->getDomainCellBB().zMax();
+
+   // 20 percent from the top are filled with the gas phase
+   real_t gas_top = Y - Y / 5.0;
+
+   // Diameter of the bubble
+   real_t D = R * 2;
+
+   // distance in between the bubbles
+   int dist = 4;
+   auto nx  = static_cast< unsigned int >(floor(X / (D + dist * W)));
+   auto nz  = static_cast< unsigned int >(floor(Z / (D + dist * W)));
+
+   // fluctuation of the bubble radii
+   std::vector< std::vector< real_t > > fluctuation_radius(nx, std::vector< real_t >(nz, 0.0));
+   std::vector< std::vector< real_t > > fluctuation_pos(nx, std::vector< real_t >(nz, 0.0));
+
+   real_t max_fluctuation_radius = R / 5;
+   real_t max_fluctuation_pos    = (dist * W) / 3.0;
+   for (unsigned int i = 0; i < nx; ++i)
+   {
+      for (unsigned int j = 0; j < nz; ++j)
+      {
+         fluctuation_radius[i][j] = math::realRandom< real_t >(-max_fluctuation_radius, max_fluctuation_radius);
+         fluctuation_pos[i][j]    = math::realRandom< real_t >(-max_fluctuation_pos, max_fluctuation_pos);
+      }
+   }
+
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+         phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+         Vector3< real_t > bubbleMidPoint;
+         for (unsigned int i = 0; i < nx; ++i) {
+            for (unsigned int j = 0; j < nz; ++j)
+            {
+               bubbleMidPoint[0] = (i + 1) * (D + (dist * W)) - (D + (dist * W)) / 2.0 + fluctuation_pos[i][j];
+               bubbleMidPoint[1] = R + W + 4;
+               bubbleMidPoint[2] = (j + 1) * (D + (dist * W)) - (D + (dist * W)) / 2.0 + fluctuation_pos[i][j];
+               real_t Ri = sqrt((globalCell[0] - bubbleMidPoint[0]) * (globalCell[0] - bubbleMidPoint[0]) +
+                                (globalCell[1] - bubbleMidPoint[1]) * (globalCell[1] - bubbleMidPoint[1]) +
+                                (globalCell[2] - bubbleMidPoint[2]) * (globalCell[2] - bubbleMidPoint[2]));
+               if (globalCell[0] >= i * (D + dist * W) && globalCell[0] <= (i + 1) * (D + dist * W) &&
+                   globalCell[2] >= j * (D + dist * W) && globalCell[2] <= (j + 1) * (D + dist * W))
+                  phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(2.0 * (Ri - (R - fluctuation_radius[i][j])) / W);
+               if (globalCell[0] > nx * (D + dist * W)) phaseField->get(x, y, z) = 1.0;
+               if (globalCell[2] > nz * (D + dist * W)) phaseField->get(x, y, z) = 1.0;
+            }
+         }
+
+         if (globalCell[1] > gas_top) {
+            phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(2.0 * (gas_top + 10 - globalCell[1]) / W);
+         })
+   }
+}
+
 void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
-                        const real_t W = 5)
+                        const real_t W = 5, const bool pipe = true)
 {
    auto X              = blocks->getDomainCellBB().xMax();
+   auto Z              = blocks->getDomainCellBB().zMax();
    auto halfY          = (blocks->getDomainCellBB().yMax()) / 2.0;
-   double perturbation = 0.05;
+   real_t perturbation = 0.05;
 
-   for (auto& block : *blocks)
+   if (pipe)
    {
-      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
-      // clang-format off
-      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(phaseField, Cell globalCell;
-         blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-         real_t tmp =
-         perturbation * X * (cos((2.0 * math::pi * globalCell[0]) / X) + cos((2.0 * math::pi * globalCell[2]) / X));
-         phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) - tmp) / (W / 2.0));
-      )
-      // clang-format on
+      for (auto& block : *blocks)
+      {
+         auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R     = sqrt((globalCell[0] - X / 2) * (globalCell[0] - X / 2) +
+                            (globalCell[2] - Z / 2) * (globalCell[2] - Z / 2));
+            if (R > X) R = X; real_t tmp = perturbation * X * cos((2.0 * math::pi * R) / X);
+            phaseField->get(x, y, z)     = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) + tmp) / (W / 2.0));)
+      }
+   }
+   else
+   {
+      for (auto& block : *blocks)
+      {
+         auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            phaseField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t tmp = perturbation * X *
+                         (cos((2.0 * math::pi * globalCell[0]) / X) + cos((2.0 * math::pi * globalCell[2]) / X));
+            phaseField->get(x, y, z) = 0.5 + 0.5 * tanh(((globalCell[1] - halfY) - tmp) / (W / 2.0));)
+      }
+   }
+}
+
+// Flags cells as boundary that lie outside the outer tube (radius xMax / 2 + 1 around the x-z centre of the domain)
+// or inside the inner cylinder. Up to y = start_transition the inner cylinder is centred with radius R_in; over the
+// following length_transition cells it is changed with a cosine ramp and stays constant afterwards. With
+// eccentricity_or_pipe_ratio == true its centre is shifted in x by eccentricity * Mx / 2, otherwise its radius
+// grows by eccentricity * R_in. Per-cell temporaries are declared inside the loop body.
+void initTubeWithCylinder(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID flagFieldID,
+                          field::FlagUID boundaryFlagUID, real_t const R_in, real_t const eccentricity,
+                          real_t const start_transition, real_t const length_transition,
+                          bool const eccentricity_or_pipe_ratio)
+{
+   const auto Mx = blocks->getDomainCellBB().xMax() / 2.0;
+   const auto Mz = blocks->getDomainCellBB().zMax() / 2.0;
+
+   const auto R_outer = blocks->getDomainCellBB().xMax() / 2.0 + 1.0;
+
+   if (eccentricity_or_pipe_ratio)
+   {
+      real_t const shift = eccentricity * Mx / 2;
+
+      for (auto& block : *blocks)
+      {
+         auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+         auto boundaryFlag = flagField->getOrRegisterFlag(boundaryFlagUID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            flagField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R1;
+            if (globalCell[1] <= start_transition) {
+               R1 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            } else if (globalCell[1] > start_transition && globalCell[1] < start_transition + length_transition) {
+               real_t tmp       = math::pi * (globalCell[1] - start_transition) / (length_transition);
+               real_t shift_tmp = shift * 0.5 * (1 - cos(tmp));
+               R1               = sqrt((globalCell[0] - Mx - shift_tmp) * (globalCell[0] - Mx - shift_tmp) +
+                         (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            } else {
+               R1 = sqrt((globalCell[0] - Mx - shift) * (globalCell[0] - Mx - shift) +
+                         (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            }
+
+            real_t R2 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            if (R1 < R_in) addFlag(flagField->get(x, y, z), boundaryFlag);
+            if (R2 > R_outer) addFlag(flagField->get(x, y, z), boundaryFlag);)
+      }
+   }
+   else
+   {
+      real_t const shift = eccentricity * R_in;
+
+      for (auto& block : *blocks)
+      {
+         auto flagField    = block.template getData< FlagField_T >(flagFieldID);
+         auto boundaryFlag = flagField->getOrRegisterFlag(boundaryFlagUID);
+         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+            flagField, Cell globalCell; blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t R_tmp;
+            if (globalCell[1] <= start_transition) {
+               R_tmp = R_in;
+            } else if (globalCell[1] > start_transition && globalCell[1] < start_transition + length_transition) {
+               real_t tmp       = math::pi * (globalCell[1] - start_transition) / (length_transition);
+               real_t shift_tmp = shift * 0.5 * (1 - cos(tmp));
+               R_tmp            = R_in + shift_tmp;
+            } else {
+               R_tmp = R_in + shift;
+            }
+
+            real_t R2 = sqrt((globalCell[0] - Mx) * (globalCell[0] - Mx) + (globalCell[2] - Mz) * (globalCell[2] - Mz));
+            if (R2 < R_tmp) addFlag(flagField->get(x, y, z), boundaryFlag);
+            if (R2 > R_outer) addFlag(flagField->get(x, y, z), boundaryFlag);)
+      }
+   }
+}
 } // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.h b/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.h
index 04bac53f2e75d17ddd393da0e247b63df05d0704..585639ac9078beb048a8ce694ea97552e59b833a 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.h
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/InitializerFunctions.h
@@ -34,6 +34,17 @@ namespace walberla
 void initPhaseField_sphere(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
                            Vector3< real_t > bubbleMidPoint, bool bubble = true, real_t W = 5);
 
-void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t W = 5);
+void init_Taylor_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t D = 5,
+                        real_t H = 2, real_t DT = 20, real_t Donut_x0 = 40);
+
+void init_bubble_field(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
+                       real_t W = 5);
+
+void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t W = 5,
+                        const bool pipe = true);
+
+void initTubeWithCylinder(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID flagFieldID,
+                          field::FlagUID boundaryFlagUID, real_t R_in, real_t eccentricity, real_t start_transition,
+                          real_t length_transition, bool const eccentricity_or_pipe_ratio);
 
 } // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/contact.cu b/apps/showcases/PhaseFieldAllenCahn/GPU/contact.cu
deleted file mode 100644
index 48d26d1c5d7ec8e9353e48cd9a81731179ce4de4..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/contact.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file contact.cu
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-
-#include "core/DataTypes.h"
-#include "core/Macros.h"
-
-#include <cmath>
-
-#include "contact.h"
-
-#define FUNC_PREFIX __global__
-
-namespace walberla
-{
-namespace lbm
-{
-#ifdef __GNUC__
-#   pragma GCC diagnostic push
-#endif
-
-#ifdef __CUDACC__
-#   pragma push
-#endif
-
-namespace internal_boundary_contact
-{
-static FUNC_PREFIX void contact_angle_treatment(uint8_t* WALBERLA_RESTRICT const _data_indexVector, double* WALBERLA_RESTRICT _data_phase,
-                                                int64_t const _stride_phase_0, int64_t const _stride_phase_1,
-                                                int64_t const _stride_phase_2, int64_t indexVectorSize, double alpha)
-{
-   if (blockDim.x * blockIdx.x + threadIdx.x < indexVectorSize)
-   {
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_10 = _data_indexVector;
-      const int32_t x = *((int32_t*) (&_data_indexVector_10[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_14 = _data_indexVector + 4;
-      const int32_t y = *((int32_t*) (&_data_indexVector_14[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_18 = _data_indexVector + 8;
-      const int32_t z = *((int32_t*) (&_data_indexVector_18[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_112 = _data_indexVector + 12;
-      const int32_t nx = *((int32_t*) (&_data_indexVector_112[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      const int32_t x1 = x + nx;
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_116 = _data_indexVector + 16;
-      const int32_t ny = *((int32_t*) (&_data_indexVector_116[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      const int32_t y1 = y + ny;
-      uint8_t* WALBERLA_RESTRICT _data_indexVector_200 = _data_indexVector + 20;
-      const int32_t nz = *((int32_t*) (&_data_indexVector_200[24 * blockDim.x * blockIdx.x + 24 * threadIdx.x]));
-      const int32_t z1 = z + nz;
-
-      const double h = 0.5 * sqrt((float) (nx * nx + ny * ny + nz * nz));
-      const double a = cos(alpha);
-      const double W = 5;
-
-      double* WALBERLA_RESTRICT _phase_wall     = _data_phase + _stride_phase_1 * y + _stride_phase_2 * z;
-      double* WALBERLA_RESTRICT _phase_interior = _data_phase + _stride_phase_1 * y1 + _stride_phase_2 * z1;
-      if (h < 0.001) { _phase_wall[_stride_phase_0 * x] = 1.0; }
-      else if (a > 1e-8 || a < -1e-8)
-      {
-         const double var = -h * (4.0 / W) * a;
-         _phase_wall[_stride_phase_0 * x] =
-            (1 + var - sqrt((1 + var) * (1 + var) - 4 * var * _phase_interior[_stride_phase_0 * x1])) / (var + 1e-12) -
-            _phase_interior[_stride_phase_0 * x1];
-      }
-      else
-      {
-         _phase_wall[_stride_phase_0 * x] = _phase_interior[_stride_phase_0 * x1];
-      }
-   }
-}
-} // namespace internal_boundary_contact
-
-#ifdef __GNUC__
-#   pragma GCC diagnostic pop
-#endif
-
-#ifdef __CUDACC__
-#   pragma pop
-#endif
-
-void contact::run(IBlock* block, IndexVectors::Type type, cudaStream_t stream)
-{
-   auto* indexVectors      = block->getData< IndexVectors >(indexVectorID);
-   int64_t indexVectorSize = int64_c(indexVectors->indexVector(type).size());
-   if (indexVectorSize == 0) return;
-
-   auto pointer = indexVectors->pointerGpu(type);
-
-   auto* _data_indexVector = reinterpret_cast< uint8_t* >(pointer);
-
-   auto phaseField = block->getData< cuda::GPUField< double > >(phaseFieldID);
-
-   auto& alpha = this->alpha_;
-   WALBERLA_ASSERT_GREATER_EQUAL(0, -int_c(phaseField->nrOfGhostLayers()))
-   double* WALBERLA_RESTRICT _data_phase = phaseField->dataAt(0, 0, 0, 0);
-   const auto _stride_pdfs_0    = int64_t(phaseField->xStride());
-   const auto _stride_pdfs_1    = int64_t(phaseField->yStride());
-   const auto _stride_pdfs_2    = int64_t(phaseField->zStride());
-   dim3 _block(int(((256 < indexVectorSize) ? 256 : indexVectorSize)), int(1), int(1));
-   dim3 _grid(int(((indexVectorSize) % (((256 < indexVectorSize) ? 256 : indexVectorSize)) == 0 ?
-                      (int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize)) :
-                      ((int64_t)(indexVectorSize) / (int64_t)(((256 < indexVectorSize) ? 256 : indexVectorSize))) + 1)),
-              int(1), int(1));
-   internal_boundary_contact::contact_angle_treatment<<< _grid, _block, 0, stream >>>(
-      _data_indexVector, _data_phase, _stride_pdfs_0, _stride_pdfs_1, _stride_pdfs_2, indexVectorSize, alpha);
-}
-
-void contact::operator()(IBlock* block, cudaStream_t stream) { run(block, IndexVectors::ALL, stream); }
-
-void contact::inner(IBlock* block, cudaStream_t stream) { run(block, IndexVectors::INNER, stream); }
-
-void contact::outer(IBlock* block, cudaStream_t stream) { run(block, IndexVectors::OUTER, stream); }
-
-} // namespace lbm
-} // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/contact.h b/apps/showcases/PhaseFieldAllenCahn/GPU/contact.h
deleted file mode 100644
index f2cd84cd0c67577ab7a40d8ac720fda46f6e7fe3..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/contact.h
+++ /dev/null
@@ -1,158 +0,0 @@
-//======================================================================================================================
-//
-//  This file is part of waLBerla. waLBerla is free software: you can
-//  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of
-//  the License, or (at your option) any later version.
-//
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-//  for more details.
-//
-//  You should have received a copy of the GNU General Public License along
-//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
-//
-//! \file contact.h
-//! \author Markus Holzer <markus.holzer@fau.de>
-//
-//======================================================================================================================
-
-#include "blockforest/StructuredBlockForest.h"
-
-#include "core/DataTypes.h"
-
-#include "cuda/GPUField.h"
-
-#include "domain_decomposition/BlockDataID.h"
-#include "domain_decomposition/IBlock.h"
-
-#include "field/FlagField.h"
-
-#include <set>
-#include <vector>
-
-namespace walberla
-{
-namespace lbm
-{
-class contact
-{
- public:
-   struct IndexInfo
-   {
-      int32_t x1;
-      int32_t y1;
-      int32_t z1;
-      int32_t x2;
-      int32_t y2;
-      int32_t z2;
-      IndexInfo(int32_t x1_, int32_t y1_, int32_t z1_, int32_t x2_, int32_t y2_, int32_t z2_)
-         : x1(x1_), y1(y1_), z1(z1_), x2(x2_), y2(y2_), z2(z2_)
-      {}
-      bool operator==(const IndexInfo& o) const
-      {
-         return x1 == o.x1 && y1 == o.y1 && z1 == o.z1 && x2 == o.x2 && y2 == o.y2 && z2 == o.z2;
-      }
-   };
-
-   class IndexVectors
-   {
-    public:
-      using CpuIndexVector = std::vector< IndexInfo >;
-
-      enum Type { ALL = 0, INNER = 1, OUTER = 2, NUM_TYPES = 3 };
-
-      IndexVectors() : cpuVectors_(NUM_TYPES) {}
-      bool operator==(IndexVectors& other) { return other.cpuVectors_ == cpuVectors_; }
-
-      ~IndexVectors()
-      {
-         for (auto& gpuVec : gpuVectors_)
-            cudaFree(gpuVec);
-      }
-
-      CpuIndexVector& indexVector(Type t) { return cpuVectors_[t]; }
-      IndexInfo* pointerCpu(Type t) { return &(cpuVectors_[t][0]); }
-
-      IndexInfo* pointerGpu(Type t) { return gpuVectors_[t]; }
-
-      void syncGPU()
-      {
-         gpuVectors_.resize(cpuVectors_.size());
-         for (size_t i = 0; i < size_t(NUM_TYPES); ++i)
-         {
-            auto& gpuVec = gpuVectors_[i];
-            auto& cpuVec = cpuVectors_[i];
-            cudaFree(gpuVec);
-            cudaMalloc(&gpuVec, sizeof(IndexInfo) * cpuVec.size());
-            cudaMemcpy(gpuVec, &cpuVec[0], sizeof(IndexInfo) * cpuVec.size(), cudaMemcpyHostToDevice);
-         }
-      }
-
-    private:
-      std::vector< CpuIndexVector > cpuVectors_;
-
-      using GpuIndexVector = IndexInfo*;
-      std::vector< GpuIndexVector > gpuVectors_;
-   };
-
-   contact(const shared_ptr< StructuredBlockForest >& blocks, BlockDataID phaseFieldID_, double alpha)
-      : phaseFieldID(phaseFieldID_), alpha_(alpha)
-   {
-      auto createIdxVector = [](IBlock* const, StructuredBlockStorage* const) { return new IndexVectors(); };
-      indexVectorID = blocks->addStructuredBlockData< IndexVectors >(createIdxVector, "IndexField_hydro_NoSlip_gpu");
-   };
-
-   void operator()(IBlock* block, cudaStream_t stream = nullptr);
-   void inner(IBlock* block, cudaStream_t stream = nullptr);
-   void outer(IBlock* block, cudaStream_t stream = nullptr);
-
-   template< typename NormalField_T >
-   void fillFromNormalField(const shared_ptr< StructuredBlockForest >& blocks, ConstBlockDataID normalFieldID)
-   {
-      for (auto& block : *blocks)
-      {
-         auto* indexVectors     = block.getData< IndexVectors >(indexVectorID);
-         auto& indexVectorAll   = indexVectors->indexVector(IndexVectors::ALL);
-         auto& indexVectorInner = indexVectors->indexVector(IndexVectors::INNER);
-         auto& indexVectorOuter = indexVectors->indexVector(IndexVectors::OUTER);
-
-         auto* normalField = block.getData< NormalField_T >(normalFieldID);
-
-         auto inner = normalField->xyzSize();
-         inner.expand(cell_idx_t(-1));
-
-         indexVectorAll.clear();
-         indexVectorInner.clear();
-         indexVectorOuter.clear();
-         // clang-format off
-         WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(normalField, Cell globalCell;
-            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-            if (normalField->get(x, y, z, 0) != 0 || normalField->get(x, y, z, 1) != 0 || normalField->get(x, y, z, 2) != 0)
-               {
-                  auto element = IndexInfo(x, y, z, normalField->get(x, y, z, 0), normalField->get(x, y, z, 1), normalField->get(x, y, z, 2));
-                  indexVectorAll.push_back(element);
-                  if (inner.contains(x, y, z))
-                     indexVectorInner.push_back(element);
-                  else
-                     indexVectorOuter.push_back(element);
-               }
-         )
-         // clang-format off
-         indexVectors->syncGPU();
-      }
-   }
-
- private:
-   void run(IBlock* block, IndexVectors::Type type, cudaStream_t stream = nullptr);
-
-   BlockDataID indexVectorID;
-
- public:
-   BlockDataID phaseFieldID;
-   double alpha_;
-};
-
-} // namespace lbm
-} // namespace walberla
\ No newline at end of file
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
index 1b3e9d6a8e15642ea4855d6be13eb378bcb8cc1e..34bdb9fdceb1432638a89d5670c7c8ce537158d5 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
@@ -18,25 +18,24 @@ class Scenario:
         self.overlappingWidth = (8, 1, 1)
         self.timeStepStrategy = 'normal'
 
-        self.contactAngle = 22
-
         # bubble parameters
         self.dropletRadius = 24.0
         self.dropletMidPoint = (64, 24, 64)
 
         # everything else
-        self.scenario = 1  # 1 rising bubble, 2 RTI
+        self.scenario = 1  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 Taylor bubble setup
 
         self.counter = 0
         self.yPositions = []
 
     @wlb.member_callback
-    def config(self, **kwargs):
+    def config(self):
         return {
             'DomainSetup': {
                 'blocks': self.blocks,
                 'cellsPerBlock': self.cells,
                 'periodic': self.periodic,
+                'tube': False
             },
             'Parameters': {
                 'timesteps': self.timesteps,
@@ -45,7 +44,6 @@ class Scenario:
                 'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
-                'contactAngle': self.contactAngle
             },
             'PhysicalParameters': {
                 'density_liquid': 1.0,
@@ -55,6 +53,7 @@ class Scenario:
                 'gravitational_acceleration': 0.0,
                 'relaxation_time_liquid': 3 * 0.166,
                 'relaxation_time_gas': 3 * 0.0166,
+                'interface_thickness': 5
             },
             'Boundaries': {
                 'Border': [
@@ -69,7 +68,7 @@ class Scenario:
             'Bubble': {
                 'bubbleMidPoint': self.dropletMidPoint,
                 'bubbleRadius': self.dropletRadius,
-                'bubble': False
+                'bubble': False  # this means we are simulating a droplet rather than a bubble
             },
         }
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
index de54dfe91b39cce943fa15ea1338e0131842866c..a97ea043fd4facc1010f2838b99e5269c91a6023 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
@@ -27,6 +27,7 @@
 
 #include "cuda/AddGPUFieldToStorage.h"
 #include "cuda/DeviceSelectMPI.h"
+#include "cuda/NVTX.h"
 #include "cuda/ParallelStreams.h"
 #include "cuda/communication/UniformGPUScheme.h"
 
@@ -36,6 +37,11 @@
 #include "field/vtk/VTKWriter.h"
 
 #include "geometry/InitBoundaryHandling.h"
+#include "geometry/mesh/TriangleMeshIO.h"
+
+#include "lbm/vtk/QCriterion.h"
+
+#include "postprocessing/FieldToSurfaceMesh.h"
 
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
@@ -43,10 +49,9 @@
 
 #include "timeloop/SweepTimeloop.h"
 
-#include "CalculateNormals.h"
 #include "InitializerFunctions.h"
 #include "PythonExports.h"
-#include "contact.h"
+#include "util.h"
 
 //////////////////////////////
 // INCLUDE GENERATED FILES //
@@ -62,7 +67,7 @@
 #include "initialize_velocity_based_distributions.h"
 #include "phase_field_LB_NoSlip.h"
 #include "phase_field_LB_step.h"
-#include "stream_hydro.h"
+#include "ContactAngle.h"
 
 ////////////
 // USING //
@@ -70,14 +75,30 @@
 
 using namespace walberla;
 
-using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
-using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
-using VelocityField_T  = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
-using NormalsField_T   = GhostLayerField< int8_t, Stencil_hydro_T::Dimension >;
-using PhaseField_T     = GhostLayerField< real_t, 1 >;
-using FlagField_T      = FlagField< uint8_t >;
+using NormalsField_T = GhostLayerField< int8_t, Stencil_hydro_T::Dimension >;
+using FlagField_T    = FlagField< uint8_t >;
+
+typedef cuda::GPUField< real_t > GPUField;
+typedef cuda::GPUField< uint8_t > GPUField_int;
+
+class Filter // cell predicate: accepts cells whose x, y and z each lie in [-1, numberOfCells_[axis]]
+{
+ public:
+   explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {}
+
+   void operator()(const IBlock& /*block*/) {} // block overload: empty body, no per-block state is updated
+   // Returns true iff every coordinate is within the inclusive range [-1, numberOfCells_[axis]].
+   bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const
+   {
+      return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) &&
+             z >= -1 && z <= cell_idx_t(numberOfCells_[2]);
+   }
 
-typedef cuda::GPUField< double > GPUField;
+ private:
+   Vector3< uint_t > numberOfCells_; // per-axis inclusive upper bound checked by the cell predicate
+};
+
+using FluidFilter_T = Filter;
 
 int main(int argc, char** argv)
 {
@@ -99,6 +120,7 @@ int main(int argc, char** argv)
 
       auto domainSetup                = config->getOneBlock("DomainSetup");
       Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+      const bool tube                 = domainSetup.getParameter< bool >("tube", true);
 
       ////////////////////////////////////////
       // ADD GENERAL SIMULATION PARAMETERS //
@@ -109,9 +131,7 @@ int main(int argc, char** argv)
       const uint_t timesteps             = parameters.getParameter< uint_t >("timesteps", uint_c(50));
       const real_t remainingTimeLoggerFrequency =
          parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
-      const uint_t scenario  = parameters.getParameter< uint_t >("scenario", uint_c(1));
-      const real_t alpha     = parameters.getParameter< real_t >("contactAngle", real_c(90));
-      const real_t alpha_rad = alpha * (math::pi / 180);
+      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
       Vector3< int > overlappingWidth =
          parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
       Vector3< int > gpuBlockSize =
@@ -124,7 +144,6 @@ int main(int argc, char** argv)
       // CPU fields
       BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_t(0), field::fzyx);
-      BlockDataID normals     = field::addToStorage< NormalsField_T >(blocks, "normals", int8_t(0), field::fzyx);
       // GPU fields
       BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >(
          blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
@@ -135,71 +154,94 @@ int main(int argc, char** argv)
       BlockDataID phase_field_gpu =
          cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
       // Flag field
-      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+      BlockDataID flagFieldID     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+      BlockDataID flagFieldID_gpu = cuda::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true);
+
+      auto physical_parameters     = config->getOneBlock("PhysicalParameters");
+      const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
+      const real_t density_gas     = physical_parameters.getParameter< real_t >("density_gas");
+      const real_t surface_tension = physical_parameters.getParameter< real_t >("surface_tension");
+      const real_t mobility        = physical_parameters.getParameter< real_t >("mobility");
+      const real_t gravitational_acceleration =
+         physical_parameters.getParameter< real_t >("gravitational_acceleration");
+      const real_t relaxation_time_liquid = physical_parameters.getParameter< real_t >("relaxation_time_liquid");
+      const real_t relaxation_time_gas    = physical_parameters.getParameter< real_t >("relaxation_time_gas");
+      const real_t interface_thickness    = physical_parameters.getParameter< real_t >("interface_thickness");
+
+      std::array< real_t, 3 > center_of_mass = { 0.0, 0.0, 0.0 };
 
-      WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the phase-field")
+      WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field")
       if (scenario == 1)
       {
          auto bubbleParameters                  = config->getOneBlock("Bubble");
          const Vector3< real_t > bubbleMidPoint = bubbleParameters.getParameter< Vector3< real_t > >("bubbleMidPoint");
          const real_t bubbleRadius              = bubbleParameters.getParameter< real_t >("bubbleRadius", 20.0);
          const bool bubble                      = bubbleParameters.getParameter< bool >("bubble", true);
-         initPhaseField_sphere(blocks, phase_field, bubbleRadius, bubbleMidPoint, bubble);
+         initPhaseField_sphere(blocks, phase_field, bubbleRadius, bubbleMidPoint, bubble, interface_thickness);
+
       }
       else if (scenario == 2)
       {
-         initPhaseField_RTI(blocks, phase_field);
+         initPhaseField_RTI(blocks, phase_field, interface_thickness, tube);
+      }
+      else if (scenario == 3)
+      { // bubble field: initialized with the configured interface width, like scenarios 1 and 2
+         auto bubbleParameters     = config->getOneBlock("Bubble");
+         const real_t bubbleRadius = bubbleParameters.getParameter< real_t >("bubbleRadius", 20.0);
+         init_bubble_field(blocks, phase_field, bubbleRadius, interface_thickness);
+      }
+      else if (scenario == 4)
+      {
+         auto TorusParameters   = config->getOneBlock("Torus");
+         const real_t midpoint  = TorusParameters.getParameter< real_t >("Donut_midpoint");
+         const real_t height    = TorusParameters.getParameter< real_t >("Donut_h");
+         const real_t diameter  = TorusParameters.getParameter< real_t >("Donut_D");
+         const real_t donutTime = TorusParameters.getParameter< real_t >("DonutTime");
+         init_Taylor_bubble(blocks, phase_field, diameter, height, donutTime, midpoint);
+         center_of_mass[0] = real_t(cellsPerBlock[0]);
+         center_of_mass[1] = real_t(midpoint);
+         center_of_mass[2] = real_t(cellsPerBlock[2]);
       }
 
-      WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the phase-field done")
+      WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field done")
 
       /////////////////
       // ADD SWEEPS //
       ///////////////
 
-      auto physical_parameters     = config->getOneBlock("PhysicalParameters");
-      const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
-      const real_t density_gas     = physical_parameters.getParameter< real_t >("density_gas");
-      const real_t surface_tension = physical_parameters.getParameter< real_t >("surface_tension");
-      const real_t mobility        = physical_parameters.getParameter< real_t >("mobility");
-      const real_t gravitational_acceleration =
-         physical_parameters.getParameter< real_t >("gravitational_acceleration");
-      const real_t relaxation_time_liquid = physical_parameters.getParameter< real_t >("relaxation_time_liquid");
-      const real_t relaxation_time_gas    = physical_parameters.getParameter< real_t >("relaxation_time_gas");
-
-      pystencils::initialize_phase_field_distributions init_h(lb_phase_field_gpu, phase_field_gpu, vel_field_gpu);
+      pystencils::initialize_phase_field_distributions init_h(lb_phase_field_gpu, phase_field_gpu, vel_field_gpu,
+                                                              interface_thickness);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
       pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, mobility, gpuBlockSize[0], gpuBlockSize[1],
-         Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
-
-      pystencils::hydro_LB_step hydro_LB_step(
-         lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gravitational_acceleration, density_liquid, density_gas,
-         surface_tension, relaxation_time_liquid, relaxation_time_gas, gpuBlockSize[0], gpuBlockSize[1],
+         flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, interface_thickness, mobility,
+         gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2],
          Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
 
-      pystencils::stream_hydro stream_hydro(lb_velocity_field_gpu, gpuBlockSize[0], gpuBlockSize[1],
-                                            Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+      pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
+                                              gravitational_acceleration, interface_thickness, density_liquid,
+                                              density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas,
+                                              gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2],
+                                              Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
 
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
 
       auto Comm_velocity_based_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
       auto generatedPackInfo_velocity_based_distributions =
-         make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
+         make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
 
-      auto Comm_phase_field = make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+      auto Comm_phase_field = make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
       Comm_phase_field->addPackInfo(generatedPackInfo_phase_field);
 
       auto Comm_phase_field_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
       auto generatedPackInfo_phase_field_distributions =
-         make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
+         make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
 
       ////////////////////////
@@ -213,17 +255,27 @@ int main(int argc, char** argv)
       if (boundariesConfig)
       {
          geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+         if (tube)
+         {
+            const real_t inner_radius      = domainSetup.getParameter< real_t >("inner_radius", real_c(0));
+            const real_t eccentricity      = domainSetup.getParameter< real_t >("ratio", real_c(0));
+            const real_t start_transition  = domainSetup.getParameter< real_t >("start_transition", real_c(60));
+            const real_t length_transition = domainSetup.getParameter< real_t >("length_transition", real_c(10));
+            const bool eccentricity_or_pipe_ration = domainSetup.getParameter< bool >("eccentricity_or_pipe_ration", true);
+            initTubeWithCylinder(blocks, flagFieldID, wallFlagUID, inner_radius, eccentricity, start_transition,
+                                 length_transition, eccentricity_or_pipe_ration);
+         }
          geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
       }
+      cuda::fieldCpy< GPUField_int, FlagField_T >(blocks, flagFieldID_gpu, flagFieldID);
 
-      calculate_normals(blocks, normals, flagFieldID, fluidFlagUID, wallFlagUID);
       lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, lb_phase_field_gpu);
       lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, lb_velocity_field_gpu);
-      lbm::contact contact_angle(blocks, phase_field_gpu, alpha_rad);
+      pystencils::ContactAngle contact_angle(blocks, phase_field_gpu, interface_thickness);
 
       phase_field_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, wallFlagUID, fluidFlagUID);
       hydro_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, wallFlagUID, fluidFlagUID);
-      contact_angle.fillFromNormalField< NormalsField_T >(blocks, normals);
+      contact_angle.fillFromFlagField< FlagField_T >(blocks, flagFieldID, wallFlagUID, fluidFlagUID);
 
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the normals-field done")
 
@@ -251,7 +303,6 @@ int main(int argc, char** argv)
 
             Comm_velocity_based_distributions->communicate(nullptr);
             hydro_LB_NoSlip(&block);
-            stream_hydro(&block);
          }
       };
       auto simpleOverlapTimeStep = [&]() {
@@ -277,13 +328,6 @@ int main(int argc, char** argv)
 
          for (auto& block : *blocks)
             hydro_LB_NoSlip(&block);
-
-         Comm_velocity_based_distributions->startCommunication(defaultStream);
-         for (auto& block : *blocks)
-            stream_hydro.inner(&block, defaultStream);
-         Comm_velocity_based_distributions->wait(defaultStream);
-         for (auto& block : *blocks)
-            stream_hydro.outer(&block, defaultStream);
       };
       std::function< void() > timeStep;
       if (timeStepStrategy == "overlap")
@@ -299,6 +343,15 @@ int main(int argc, char** argv)
 
       timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
 
+      if (scenario == 4)
+      {
+         python_coupling::PythonCallback smear_interface("interface_diffusion");
+         if (smear_interface.isCallable())
+         {
+            smear_interface.data().exposeValue("blocks", blocks);
+            smear_interface();
+         }
+      }
       cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field);
 
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs")
@@ -309,23 +362,100 @@ int main(int argc, char** argv)
       }
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs done")
       uint_t dbWriteFrequency = parameters.getParameter< uint_t >("dbWriteFrequency", 10000000);
+      int targetRank          = 0;
 
       timeLoop->addFuncAfterTimeStep(
          [&]() {
             if (timeLoop->getCurrentTimeStep() % dbWriteFrequency == 0)
             {
                cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-               python_coupling::PythonCallback callback("at_end_of_time_step");
-               if (callback.isCallable())
+               cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
+               if (scenario == 4)
+               {
+                  std::array< real_t, 4 > total_velocity = { 0.0, 0.0, 0.0, 0.0 };
+                  real_t volume  = real_t(0); // flood_fill only runs on targetRank; other ranks expose this 0
+                  uint_t nrCells = uint_t(0); // zero-initialised for the same reason (was read uninitialised)
+                  PhaseField_T gatheredPhaseField(0, 0, 0, 0);
+                  VelocityField_T gatheredVelocityField(0, 0, 0, 0);
+
+                  CellInterval boundingBox = blocks->getDomainCellBB();
+                  if (cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5) >= 0)
+                     boundingBox.min()[1] = cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5);
+                  if (cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5) <= boundingBox.max()[1])
+                     boundingBox.max()[1] = cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5);
+
+                  field::gather< PhaseField_T >(gatheredPhaseField, blocks, phase_field, boundingBox, targetRank);
+                  field::gather< VelocityField_T >(gatheredVelocityField, blocks, vel_field, boundingBox, targetRank);
+
+                  WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
+                  {
+                     flood_fill(gatheredPhaseField, gatheredVelocityField, boundingBox, volume, nrCells, center_of_mass,
+                                total_velocity);
+                  }
+                  WALBERLA_MPI_SECTION() { walberla::mpi::broadcastObject(center_of_mass, targetRank); }
+
+                  python_coupling::PythonCallback callback("at_end_of_time_step");
+                  if (callback.isCallable())
+                  {
+                     callback.data().exposeValue("blocks", blocks);
+                     callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                     callback.data().exposeValue("target_rank", targetRank);
+                     callback.data().exposeValue("bounding_box_min", boundingBox.min()[1]);
+                     callback.data().exposeValue("bounding_box_max", boundingBox.max()[1]);
+                     callback.data().exposeValue("total_velocity", total_velocity[0]);
+                     callback.data().exposeValue("total_velocity_X", total_velocity[1]);
+                     callback.data().exposeValue("total_velocity_Y", total_velocity[2]);
+                     callback.data().exposeValue("total_velocity_Z", total_velocity[3]);
+                     callback.data().exposeValue("center_of_mass_X", center_of_mass[0]);
+                     callback.data().exposeValue("center_of_mass_Y", center_of_mass[1]);
+                     callback.data().exposeValue("center_of_mass_Z", center_of_mass[2]);
+                     callback.data().exposeValue("sum_inv_phi", volume);
+                     callback.data().exposeValue("gas_cells_of_the_taylor_bubble", nrCells);
+                     callback.data().exposeValue("stencil_phase", stencil_phase_name);
+                     callback.data().exposeValue("stencil_hydro", stencil_hydro_name);
+                     callback();
+                  }
+               }
+               else
                {
-                  callback.data().exposeValue("blocks", blocks);
-                  callback.data().exposeValue( "timeStep", timeLoop->getCurrentTimeStep());
-                  callback();
+                  python_coupling::PythonCallback callback("at_end_of_time_step");
+                  if (callback.isCallable())
+                  {
+                     callback.data().exposeValue("blocks", blocks);
+                     callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                     callback.data().exposeValue("stencil_phase", stencil_phase_name);
+                     callback.data().exposeValue("stencil_hydro", stencil_hydro_name);
+                     callback();
+                  }
                }
             }
          },
          "Python callback");
 
+      int meshWriteFrequency = parameters.getParameter< int >("meshWriteFrequency", 0);
+      int counter            = 0;
+      if (meshWriteFrequency > 0)
+      {
+         timeLoop->addFuncAfterTimeStep(
+            [&]() {
+               if (timeLoop->getCurrentTimeStep() % uint_t(meshWriteFrequency) == 0)
+               {
+                  auto mesh = postprocessing::realFieldToSurfaceMesh< PhaseField_T >(blocks, phase_field, 0.5, 0, true,
+                                                                                     targetRank, MPI_COMM_WORLD);
+                  WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
+                  {
+                     std::string path = "";
+                     std::ostringstream out;
+                     out << std::internal << std::setfill('0') << std::setw(6) << counter;
+                     geometry::writeMesh(
+                        path + "taylor_bubble_D_" + std::to_string(cellsPerBlock[0]) + "_" + out.str() + ".obj", *mesh);
+                     counter++;
+                  }
+               }
+            },
+            "Mesh writer");
+      }
+
       // remaining time logger
       timeLoop->addFuncAfterTimeStep(
          timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
@@ -339,19 +469,22 @@ int main(int argc, char** argv)
                                                          "simulation_step", false, true, true, false, 0);
          vtkOutput->addBeforeFunction([&]() {
             cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-            // cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
+            cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
          });
-         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "PhaseField");
+         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T, float > >(phase_field, "PhaseField");
          vtkOutput->addCellDataWriter(phaseWriter);
 
-         // auto normlasWriter = make_shared<field::VTKWriter<NormalsField_T>>(normals, "Normals");
-         // vtkOutput->addCellDataWriter(normlasWriter);
+         auto flagWriter = make_shared< field::VTKWriter< FlagField_T > >(flagFieldID, "flag");
+         vtkOutput->addCellDataWriter(flagWriter);
+
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float > >(vel_field, "Velocity");
+         vtkOutput->addCellDataWriter(velWriter);
 
-         // auto flagWriter = make_shared<field::VTKWriter<FlagField_T>>(flagFieldID, "flag");
-         // vtkOutput->addCellDataWriter(flagWriter);
+         FluidFilter_T filter(cellsPerBlock);
 
-         // auto velWriter = make_shared<field::VTKWriter<VelocityField_T>>(vel_field, "Velocity");
-         // vtkOutput->addCellDataWriter(velWriter);
+         auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T, float > >(
+            blocks, filter, vel_field, "Q-Criterion");
+         vtkOutput->addCellDataWriter(QCriterionWriter);
 
          timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
@@ -371,6 +504,5 @@ int main(int argc, char** argv)
       WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
       WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
    }
-
    return EXIT_SUCCESS;
 }
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
index c9798504de79d23c5bbaaa686b209c917d8a63f5..024b66a9e15c4291f045d500d4493de2448f137b 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
@@ -1,8 +1,11 @@
+import os
+
 import waLBerla as wlb
-import waLBerla.tools.sqlitedb as wlbSqlite
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
 from waLBerla.core_extension import makeSlice
 
 import numpy as np
+import pandas as pd
 from lbmpy.phasefield_allen_cahn.parameter_calculation import calculate_parameters_rti
 
 
@@ -10,12 +13,11 @@ class Scenario:
     def __init__(self):
         # output frequencies
         self.vtkWriteFrequency = 1000
-        self.dbWriteFrequency = 200
 
         # simulation parameters
-        self.timesteps = 27001
+        self.time = 2  # physical time in seconds
 
-        self.cells = (64, 256, 64)
+        self.cells = (128, 512, 128)
         self.blocks = (1, 1, 1)
         self.periodic = (1, 0, 1)
         self.size = (self.cells[0] * self.blocks[0],
@@ -24,25 +26,40 @@ class Scenario:
 
         # physical parameters
         self.density_heavy = 1.0
-        self.reference_time = 6000
-        self.parameters = calculate_parameters_rti(reference_length=128,
+        self.reference_time = 4000
+        self.dbWriteFrequency = self.reference_time // 20
+        self.timesteps = int(self.reference_time * self.time) + 1
+
+        self.capillary_number = 8.7
+        self.reynolds_number = 3000
+        self.atwood_number = 1
+        self.peclet_number = 744
+        self.density_ratio = 1000
+        self.viscosity_ratio = 100
+
+        self.parameters = calculate_parameters_rti(reference_length=self.cells[0],
                                                    reference_time=self.reference_time,
                                                    density_heavy=self.density_heavy,
-                                                   capillary_number=9.1,
-                                                   reynolds_number=128,
-                                                   atwood_number=1.0,
-                                                   peclet_number=140,
-                                                   density_ratio=3,
-                                                   viscosity_ratio=3)
+                                                   capillary_number=self.capillary_number,
+                                                   reynolds_number=self.reynolds_number,
+                                                   atwood_number=self.atwood_number,
+                                                   peclet_number=self.peclet_number,
+                                                   density_ratio=self.density_ratio,
+                                                   viscosity_ratio=self.viscosity_ratio)
+
+        self.interface_thickness = 5
+        self.tube = False
 
         # everything else
-        self.dbFile = "RTI.db"
+        self.dbFile = "RTI.csv"
 
-        self.scenario = 2  # 1 rising bubble, 2 RTI
+        self.scenario = 2  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
 
         self.counter = 0
         self.yPositions = []
 
+        self.config_dict = self.config()
+
     @wlb.member_callback
     def config(self):
         return {
@@ -50,23 +67,24 @@ class Scenario:
                 'blocks': self.blocks,
                 'cellsPerBlock': self.cells,
                 'periodic': self.periodic,
+                'tube': self.tube
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
                 'dbWriteFrequency': self.dbWriteFrequency,
-                'useGui': 0,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
             },
             'PhysicalParameters': {
                 'density_liquid': self.density_heavy,
-                'density_gas': self.parameters["density_light"],
-                'surface_tension': self.parameters["surface_tension"],
-                'mobility': self.parameters.get("mobility", 0.1),
-                'gravitational_acceleration': self.parameters["gravitational_acceleration"],
+                'density_gas': self.parameters.get("density_light"),
+                'surface_tension': self.parameters.get("surface_tension"),
+                'mobility': self.parameters.get("mobility"),
+                'gravitational_acceleration': self.parameters.get("gravitational_acceleration"),
                 'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
                 'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
                 'Border': [
@@ -79,55 +97,79 @@ class Scenario:
     @wlb.member_callback
     def at_end_of_time_step(self, blocks, **kwargs):
         t = kwargs['timeStep']
-        ny = self.size[1]
-        l0 = self.size[0]
         if t % self.dbWriteFrequency == 0:
-            location_of_spike = -100
-            location_of_bubble = -100
-            location_of_saddle = -100
-            mass = -100
-            spike_data = wlb.field.gather(blocks, 'phase', makeSlice[self.size[0] // 2, :, self.size[2] // 2])
-            if spike_data:
-                spike_field = np.asarray(spike_data).squeeze()
-                location_of_spike = (np.argmax(spike_field > 0.5) - ny // 2) / l0
-
-            bubble_data = wlb.field.gather(blocks, 'phase', makeSlice[0, :, 0])
-            if bubble_data:
-                bubble_field = np.asarray(bubble_data).squeeze()
-                location_of_bubble = (np.argmax(bubble_field > 0.5) - ny // 2) / l0
-
-            saddle_data = wlb.field.gather(blocks, 'phase', makeSlice[0, :, self.size[2] // 2])
-            if saddle_data:
-                saddle_field = np.asarray(saddle_data).squeeze()
-                location_of_saddle = (np.argmax(saddle_field > 0.5) - ny // 2) / l0
-
             phase = wlb.field.gather(blocks, 'phase', makeSlice[:, :, :])
             if phase:
+                data = {'timestep': t}
+                data.update(self.config_dict['PhysicalParameters'])
+                data.update({'total_timesteps': self.timesteps})
+                data.update({'normalized_time': t / self.reference_time})
+                data.update({'tube_setup': self.tube})
+                data.update({'interface_thickness': self.interface_thickness})
+                data.update({'capillary_number': self.capillary_number})
+                data.update({'reynolds_number': self.reynolds_number})
+                data.update({'atwood_number': self.atwood_number})
+                data.update({'peclet_number': self.peclet_number})
+                data.update({'density_ratio': self.density_ratio})
+                data.update({'viscosity_ratio': self.viscosity_ratio})
+                data.update({'reference_time': self.reference_time})
+                data.update(kwargs)
+
                 phase_field = np.asarray(phase).squeeze()
+                stable = np.isfinite(np.sum(phase_field))
                 mass = np.sum(phase_field)
+                rel_max = np.max(phase_field) - 1
+                rel_min = np.min(phase_field)
+                data.update({'mass': mass})
+                data.update({'rel_max': rel_max})
+                data.update({'rel_min': rel_min})
+                data.update({'stable': stable})
+
+                if self.tube:
+                    location_of_spike = self.get_interface_location(
+                        phase_field[self.size[0] // 2, :, self.size[2] // 2])
+                    a = np.where(phase_field < 0.5)
+                    value = np.argmax(a[1])
+                    location_of_bubble = self.get_interface_location(
+                        phase_field[a[0][value], a[1][value] - 10:a[1][value] + 10, a[2][value]], a[1][value] - 10)
+
+                    data.update({'location_of_spike': location_of_spike})
+                    data.update({'location_of_bubble': location_of_bubble})
+                else:
+                    location_of_spike = self.get_interface_location(
+                        phase_field[self.size[0] // 2, :, self.size[2] // 2])
+                    location_of_bubble = self.get_interface_location(phase_field[0, :, 0])
+                    location_of_saddle = self.get_interface_location(phase_field[0, :, self.size[2] // 2])
+
+                    data.update({'location_of_spike': location_of_spike})
+                    data.update({'location_of_bubble': location_of_bubble})
+                    data.update({'location_of_saddle': location_of_saddle})
+
+                sequenceValuesToScalars(data)
+
+                csv_file = f"RTI_{data['stencil_phase']}_{data['stencil_hydro']}_Re_{self.reynolds_number}_tube.csv"
+
+                df = pd.DataFrame.from_records([data])
+                if not os.path.isfile(csv_file):
+                    df.to_csv(csv_file, index=False)
+                else:
+                    df.to_csv(csv_file, index=False, mode='a', header=False)
+
+    def get_interface_location(self, one_dimensional_array, shift=None):
+        ny = self.size[1]
+        l0 = self.size[0]
 
-            self.write_result_to_database(t, location_of_spike, location_of_bubble, location_of_saddle, mass)
-
-    def write_result_to_database(self, t, spike, bubble, saddle, mass):
-        """Writes the simulation result stored in the global variables shapeFactors and angles to
-               an sqlite3 database, and resets the global variables."""
-        result = {'waLBerlaVersion': wlb.build_info.version,
-                  'xCells': self.size[0],
-                  'yCells': self.size[1],
-                  'zCells': self.size[2],
-                  'spike': spike,
-                  'bubble': bubble,
-                  'saddle': saddle,
-                  'mass': mass,
-                  'timesteps': t,
-                  'normalized_time': t / self.reference_time,
-                  'processes': wlb.mpi.numProcesses(),
-                  }
-        try:
-            wlbSqlite.checkAndUpdateSchema(result, 'interface_location', self.dbFile)
-            wlbSqlite.storeSingle(result, 'interface_location', self.dbFile)
-        except Exception as e:
-            wlb.log_warning("Failed to store run in database " + str(e) + "\n" + str(result))
+        index = np.argmax(one_dimensional_array > 0.5)  # first index above 0.5 (0 if there is none)
+
+        if index > 0:
+            zw1 = one_dimensional_array[index]  # first sample above 0.5
+            zw2 = one_dimensional_array[index - 1]  # sample directly before it (at or below 0.5)
+            absolute_location = (index - 1) + (zw2 - 0.5) / (zw2 - zw1)  # linearly interpolated 0.5 crossing
+            if shift:
+                absolute_location += shift  # caller passes the slice start to recover the full-array index
+            return (absolute_location - ny // 2) / l0  # offset by size[1] // 2, divided by size[0]
+        else:
+            return -100  # sentinel: nothing above 0.5, or the very first sample already is
 
 
 scenarios = wlb.ScenarioManager()
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
index 48a19621db6f0e62be8c6bc21f6ecc4c65faa90a..0abf47375eaa5d43924d6582761307278ecb52ea 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
@@ -1,24 +1,33 @@
+from lbmpy.phasefield_allen_cahn.contact_angle import ContactAngle
 from pystencils import fields, TypedSymbol
 from pystencils.simp import sympy_cse
-from pystencils import AssignmentCollection
+from pystencils import Assignment
+from pystencils.astnodes import Block, Conditional
 
 from lbmpy.boundaries import NoSlip
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_method
 from lbmpy.stencils import get_stencil
 
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
-from lbmpy_walberla import generate_boundary
+import pystencils_walberla
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
+from lbmpy_walberla import generate_boundary, generate_lb_pack_info
 
 from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb,\
-    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, get_collision_assignments_hydro
+    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, get_collision_assignments_hydro,\
+    get_collision_assignments_phase
 
 from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
 
 import numpy as np
 import sympy as sp
 
-stencil_phase = get_stencil("D3Q19")
-stencil_hydro = get_stencil("D3Q27")
+stencil_phase_name = "D3Q27"
+stencil_hydro_name = "D3Q27"
+
+contact_angle_in_degrees = 22
+
+stencil_phase = get_stencil(stencil_phase_name)
+stencil_hydro = get_stencil(stencil_hydro_name)
 q_phase = len(stencil_phase)
 q_hydro = len(stencil_hydro)
 
@@ -43,7 +52,7 @@ relaxation_time_gas = sp.Symbol("tau_L")
 # phase-field parameter
 drho3 = (density_liquid - density_gas) / 3
 # interface thickness
-W = 5
+W = sp.Symbol("interface_thickness")
 # coefficients related to surface tension
 beta = 12.0 * (surface_tension / W)
 kappa = 1.5 * surface_tension * W
@@ -56,6 +65,7 @@ kappa = 1.5 * surface_tension * W
 u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
 # phase-field
 C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
+C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
 
 flag = fields(f"flag_field: uint8[{dimensions}D]", layout='fzyx')
 # phase-field distribution functions
@@ -91,12 +101,17 @@ relaxation_rate_cutoff = sp.Piecewise((1 / (0.5 + relaxation_time_liquid), C.cen
 # LBM METHODS #
 ###############
 
-method_phase = create_lb_method(stencil=stencil_phase, method='srt',
-                                relaxation_rate=relaxation_rate_allen_cahn, compressible=True)
+# method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
+#                                 relaxation_rates=[1, 1.5, 1, 1.5, 1, 1.5])
+method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
+                                relaxation_rates=[1, 1, 1, 1, 1, 1])
+
+method_phase.set_conserved_moments_relaxation_rate(relaxation_rate_allen_cahn)
 
 method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
                                 relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1])
 
+
 # create the kernels for the initialization of the g and h field
 h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W, fd_stencil=get_stencil("D3Q27"))
 g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
@@ -107,60 +122,70 @@ force_model_h = MultiphaseForceModel(force=force_h)
 force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
                              fd_stencil=get_stencil("D3Q27"))
 
+force_model_g = MultiphaseForceModel(force=force_g, rho=density)
+
 ####################
 # LBM UPDATE RULES #
 ####################
 
-h_tmp_symbol_list = [h_tmp.center(i) for i, _ in enumerate(stencil_phase)]
-sum_h = np.sum(h_tmp_symbol_list[:])
+phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
+                                                      velocity_input=u,
+                                                      output={'density': C_tmp},
+                                                      force_model=force_model_h,
+                                                      symbolic_fields={"symbolic_field": h,
+                                                                       "symbolic_temporary_field": h_tmp},
+                                                      kernel_type='stream_pull_collide')
 
-method_phase.set_force_model(force_model_h)
-
-phase_field_LB_step = create_lb_update_rule(lb_method=method_phase,
-                                            velocity_input=u,
-                                            compressible=True,
-                                            optimization={"symbolic_field": h,
-                                                          "symbolic_temporary_field": h_tmp},
-                                            kernel_type='stream_pull_collide')
-
-phase_field_LB_step.set_main_assignments_from_dict({**phase_field_LB_step.main_assignments_dict, **{C.center: sum_h}})
-phase_field_LB_step = AssignmentCollection(main_assignments=phase_field_LB_step.main_assignments,
-                                           subexpressions=phase_field_LB_step.subexpressions)
 phase_field_LB_step = sympy_cse(phase_field_LB_step)
 
+phase_field_LB_step = [Conditional(sp.Eq(flag.center(), 2),
+                                   Block(phase_field_LB_step),
+                                   Block([Assignment(C_tmp.center, C.center)]))]
 # ---------------------------------------------------------------------------------------------------------
 hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
                                                 density=density,
                                                 velocity_input=u,
-                                                force=force_g,
+                                                force_model=force_model_g,
                                                 sub_iterations=2,
                                                 symbolic_fields={"symbolic_field": g,
                                                                  "symbolic_temporary_field": g_tmp},
-                                                kernel_type='collide_only')
+                                                kernel_type='collide_stream_push')
 
 hydro_LB_step.set_sub_expressions_from_dict({**{relaxation_rate: relaxation_rate_cutoff},
                                              **hydro_LB_step.subexpressions_dict})
 
-stream_hydro = create_lb_update_rule(stencil=stencil_hydro,
-                                     optimization={"symbolic_field": g,
-                                                   "symbolic_temporary_field": g_tmp},
-                                     kernel_type='stream_pull_only')
+hydro_LB_step = [Conditional(sp.Eq(flag.center(), 2), Block(hydro_LB_step))]
+
+contact_angle = ContactAngle(contact_angle_in_degrees, W)
+
 
 ###################
 # GENERATE SWEEPS #
 ###################
 
 vp = [('int32_t', 'cudaBlockSize0'),
-      ('int32_t', 'cudaBlockSize1')]
+      ('int32_t', 'cudaBlockSize1'),
+      ('int32_t', 'cudaBlockSize2')]
 
 sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
                     TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
+                    TypedSymbol("cudaBlockSize2", np.int32))
+
 sweep_params = {'block_size': sweep_block_size}
 
 info_header = f"""
+using namespace walberla;
 #include "stencil/D3Q{q_phase}.h"\nusing Stencil_phase_T = walberla::stencil::D3Q{q_phase};
 #include "stencil/D3Q{q_hydro}.h"\nusing Stencil_hydro_T = walberla::stencil::D3Q{q_hydro};
+using PdfField_phase_T = GhostLayerField<real_t, {q_phase}>;
+using PdfField_hydro_T = GhostLayerField<real_t, {q_hydro}>;
+using VelocityField_T = GhostLayerField<real_t, {dimensions}>;
+using PhaseField_T = GhostLayerField<real_t, 1>;
+#ifndef UTIL_H
+#define UTIL_H
+const char * const stencil_phase_name = "{stencil_phase_name}";
+const char * const stencil_hydro_name = "{stencil_hydro_name}";
+#endif
 """
 
 with CodeGeneration() as ctx:
@@ -168,35 +193,33 @@ with CodeGeneration() as ctx:
     generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target='gpu')
 
     generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
-                   field_swaps=[(h, h_tmp)],
+                   field_swaps=[(h, h_tmp), (C, C_tmp)],
                    inner_outer_split=True,
                    target='gpu',
                    gpu_indexing_params=sweep_params,
                    varying_parameters=vp)
-    generate_boundary(ctx, 'phase_field_LB_NoSlip', NoSlip(), method_phase, target='gpu')
+    generate_boundary(ctx, 'phase_field_LB_NoSlip', NoSlip(), method_phase, target='gpu', streaming_pattern='pull')
 
     generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
-                   inner_outer_split=True,
-                   target='gpu',
-                   gpu_indexing_params=sweep_params,
-                   varying_parameters=vp)
-    generate_boundary(ctx, 'hydro_LB_NoSlip', NoSlip(), method_hydro, target='gpu')
-
-    generate_sweep(ctx, 'stream_hydro', stream_hydro,
                    field_swaps=[(g, g_tmp)],
                    inner_outer_split=True,
                    target='gpu',
                    gpu_indexing_params=sweep_params,
                    varying_parameters=vp)
+    generate_boundary(ctx, 'hydro_LB_NoSlip', NoSlip(), method_hydro, target='gpu', streaming_pattern='push')
 
     # communication
 
-    generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
-                                   phase_field_LB_step.main_assignments, target='gpu')
-    generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
-                                   hydro_LB_step.all_assignments, target='gpu', kind='pull')
-    generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
-                                   stream_hydro.all_assignments, target='gpu', kind='pull')
+    generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                          streaming_pattern='pull', target='gpu')
+
+    generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                          streaming_pattern='push', target='gpu')
+
+    generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='gpu')
+
+    pystencils_walberla.boundary.generate_boundary(ctx, 'ContactAngle', contact_angle,
+                                                   C.name, stencil_hydro, index_shape=[], target='gpu')
 
     ctx.write_file("GenDefines.h", info_header)
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
index 9f5f08323daf8ee885fd35d510260eda43512daf..186590774aa43e8624eaf891f94616aad081360f 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
@@ -39,6 +39,8 @@ class Scenario:
                                                                 density_ratio=1000,
                                                                 viscosity_ratio=100)
 
+        self.interface_thickness = 5
+
         # everything else
         self.dbFile = "risingBubble3D.db"
 
@@ -72,6 +74,7 @@ class Scenario:
                 'gravitational_acceleration': self.parameters["gravitational_acceleration"],
                 'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
                 'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
                 'Border': [
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py
new file mode 100644
index 0000000000000000000000000000000000000000..55db7aa693400b12f71d097470c57bed2609ee8e
--- /dev/null
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py
@@ -0,0 +1,69 @@
+import math
+
+
+def calculate_parameters_taylor_bubble(reference_length=128,
+                                       reference_time=16000,
+                                       density_heavy=1.0,
+                                       d1=0.0254,
+                                       d2=0.0127):
+    r"""
+    Calculate the simulation parameters for a rising Taylor bubble. The calculation can be found in
+    10.1016/S0009-2509(97)00210-8 by G. Das
+
+    The air-water properties below fix the inverse viscosity, Bond and Morton numbers. The lattice
+    viscosities and surface tension are then derived from the inverse viscosity and Bond numbers.
+
+    Args:
+        reference_length: chosen reference length
+        reference_time: chosen reference time
+        density_heavy: density of the heavier fluid
+        d1: diameter of the outer tube
+        d2: diameter of the inner cylinder
+
+    Returns:
+        dict with the dimensionless numbers and the derived lattice parameters. The ``dynamic_viscosity_*``
+        entries hold viscosity divided by density; ``gravitational_acceleration`` carries a negative sign.
+    """
+    # material properties of the physical air-water system
+    water_rho = 998  # kg/m3
+    air_rho = 1.2047  # kg/m3
+    water_mu = 1.002e-3  # kg/ms
+    air_mu = 1.8205e-5  # kg/ms
+    surface_tension = 0.07286  # kg/s2
+    gravity = 9.81  # m/s2
+
+    physical_dh = d1 - d2
+    diameter_ratio = d1 / d2
+
+    inverse_viscosity_number = math.sqrt((water_rho - air_rho) * water_rho * gravity * physical_dh ** 3) / water_mu
+    bond_number = (water_rho - air_rho) * gravity * physical_dh ** 2 / surface_tension
+    morton_number = gravity * water_mu ** 4 * (water_rho - air_rho) / (water_rho ** 2 * surface_tension ** 3)
+
+    # lattice quantities derived from the reference length and time
+    d = reference_length / diameter_ratio
+    dh = reference_length - d
+    g = dh / reference_time ** 2
+    density_light = 1.0 / (water_rho / air_rho)
+
+    mu_h = math.sqrt((density_heavy - density_light) * density_heavy * g * dh ** 3) / inverse_viscosity_number
+    mu_l = mu_h / (water_mu / air_mu)
+
+    dynamic_viscosity_heavy = mu_h / density_heavy
+    dynamic_viscosity_light = mu_l / density_light
+
+    relaxation_time_heavy = 3 * dynamic_viscosity_heavy
+    relaxation_time_light = 3 * dynamic_viscosity_light
+    sigma = (density_heavy - density_light) * g * dh ** 2 / bond_number
+
+    return {
+        "inverse_viscosity_number": inverse_viscosity_number,
+        "bond_number": bond_number,
+        "morton_number": morton_number,
+        "density_light": density_light,
+        "dynamic_viscosity_heavy": dynamic_viscosity_heavy,
+        "dynamic_viscosity_light": dynamic_viscosity_light,
+        "relaxation_time_heavy": relaxation_time_heavy,
+        "relaxation_time_light": relaxation_time_light,
+        "gravitational_acceleration": -g,
+        "surface_tension": sigma
+    }
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py
new file mode 100755
index 0000000000000000000000000000000000000000..0afb25234ac119bff33d6876e23bfec7cd79391b
--- /dev/null
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py
@@ -0,0 +1,325 @@
+import math
+import os
+import pickle as pl
+import waLBerla as wlb
+from matplotlib import pyplot as plt
+import numpy as np
+import pandas as pd
+
+from waLBerla.core_extension import makeSlice
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from scipy.ndimage.filters import gaussian_filter
+import scipy.spatial as spatial
+
+from parameters_taylor_bubble import calculate_parameters_taylor_bubble
+
+
+def intersection(points1, points2, eps):
+    """Return each point of ``points1`` that is the nearest neighbour, within ``eps``, of a point in ``points2``."""
+    kd_tree = spatial.KDTree(points1)
+    dist, idx = kd_tree.query(points2, k=1, distance_upper_bound=eps)
+    return kd_tree.data[idx[np.isfinite(dist)]]
+
+
+def contour_points(contour, steps=1):
+    """Stack the vertices of every path of a contour set, each path interpolated with ``steps``."""
+    paths = (path for linecol in contour.collections for path in linecol.get_paths())
+    return np.vstack([path.interpolated(steps).vertices for path in paths])
+
+
+def test_line(center_x, center_z, angle, contour, tol):
+    """Probe whether a ray from (center_x, center_z) at ``angle`` degrees comes within ``tol`` of ``contour``.
+
+    The ray is sampled at 200 points; it extends by ``center_x`` times the cosine and ``center_z`` times the
+    sine of the angle.
+
+    Returns:
+        1 if at least one contour point lies within ``tol`` of a sample point, otherwise -1.
+    """
+    n_samples = 200
+    angle_rad = np.radians(angle)
+    ray = np.zeros((n_samples, 2))
+    ray[:, 0] = center_x + np.linspace(0, center_x, n_samples) * np.cos(angle_rad)
+    ray[:, 1] = center_z + np.linspace(0, center_z, n_samples) * np.sin(angle_rad)
+    hits = intersection(contour_points(contour), ray, tol)
+    return 1 if len(hits) != 0 else -1
+
+
+def get_circle(midpoint, radius):
+    """Return the x and y coordinate arrays (100 samples each) of a circle around ``midpoint``."""
+    phi = np.linspace(0, 2 * np.pi, 100)
+    xs = midpoint[0] + radius * np.cos(phi)
+    ys = midpoint[1] + radius * np.sin(phi)
+
+    return xs, ys
+
+
+class Scenario:
+    def __init__(self):
+        """Define the Taylor bubble set-up: lattice resolution, output intervals and fluid properties.
+
+        Fluid properties come from ``calculate_parameters_taylor_bubble``; the assembled configuration
+        is cached in ``self.config_dict``.
+        """
+        tube_diameter = 0.0254  # (0.0508, 0.0381, 0.0254)
+        cylinder_diameter = 0.0127  # (0.0254, 0.0127, 0.0127)
+        gpus = 1
+
+        self.density_liquid = 1.0
+        self.reference_time = 16000
+        self.reference_length = 128
+        self.interface_width = 5
+        self.mobility = 0.05
+
+        # output frequencies
+        self.vtkWriteFrequency = self.reference_time
+        self.dbWriteFrequency = self.reference_time // 25
+        self.meshWriteFrequency = self.reference_time
+        self.pngWriteFrequency = self.reference_time
+
+        # simulation parameters
+        self.diameter = self.reference_length
+        self.timesteps = self.reference_time * 15 + 1
+        self.cells = (self.diameter, (self.reference_length * 15) // gpus, self.diameter)
+        self.blocks = (1, gpus, 1)
+        self.periodic = (0, 0, 0)
+        self.size = tuple(cells * blocks for cells, blocks in zip(self.cells, self.blocks))
+        self.inner_radius = self.diameter // 4
+
+        self.center_x, self.center_y, self.center_z = (extent / 2 for extent in self.size)
+
+        self.overlappingWidth = (8, 1, 1)
+        self.timeStepStrategy = 'normal'
+
+        self.scenario = 4  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
+
+        self.counter = 0
+        self.yPositions = []
+
+        self.eccentricity_or_pipe_ratio = False  # if True eccentricity is conducted otherwise pipe ratio
+        self.ratio = 0.5
+
+        self.start_transition = (self.size[1] // 2) - 2 * self.diameter
+        self.length_transition = 4 * self.diameter
+
+        setup = "eccentricity" if self.eccentricity_or_pipe_ratio else "ratio"
+        self.csv_file = (f"Taylor_bubble_D_{self.diameter}_DasC_{setup}_{self.ratio}_W_"
+                         f"{self.interface_width}_M_{self.mobility}.csv")
+
+        # values exported in the 'Torus' section of the configuration
+        half_diameter = self.diameter / 2
+        diameter_difference = self.diameter - half_diameter
+        resolution = self.diameter / 128
+
+        self.Donut_D = 0.1 * self.diameter / resolution
+        self.Donut_h = diameter_difference / 6
+        self.DonutTime = 0.5 * (self.diameter + half_diameter) / 2
+
+        parameters = calculate_parameters_taylor_bubble(reference_length=self.reference_length,
+                                                        reference_time=self.reference_time,
+                                                        density_heavy=self.density_liquid,
+                                                        d1=tube_diameter, d2=cylinder_diameter)
+
+        self.density_gas = parameters["density_light"]
+        self.surface_tension = parameters["surface_tension"]
+
+        self.gravitational_acceleration = parameters["gravitational_acceleration"]
+
+        self.relaxation_time_liquid = parameters.get("relaxation_time_heavy")
+        self.relaxation_time_gas = parameters.get("relaxation_time_light")
+
+        self.config_dict = self.config()
+
+    @wlb.member_callback
+    def config(self):
+        """Assemble the nested configuration dictionary from the scenario attributes.
+
+        All six domain borders ('N', 'S', 'W', 'E', 'T', 'B') are flagged as 'NoSlip'.
+        """
+        walls = [{'direction': side, 'walldistance': -1, 'flag': 'NoSlip'} for side in 'NSWETB']
+        domain_setup = {
+            'blocks': self.blocks,
+            'cellsPerBlock': self.cells,
+            'periodic': self.periodic,
+            'inner_radius': self.inner_radius,
+            'ratio': self.ratio,
+            'start_transition': self.start_transition,
+            'length_transition': self.length_transition,
+            'eccentricity_or_pipe_ration': self.eccentricity_or_pipe_ratio,  # sic; NOTE(review): confirm reader key
+            'tube': True
+        }
+        parameters = {
+            'timesteps': self.timesteps,
+            'vtkWriteFrequency': self.vtkWriteFrequency,
+            'dbWriteFrequency': self.dbWriteFrequency,
+            'meshWriteFrequency': self.meshWriteFrequency,
+            'remainingTimeLoggerFrequency': 60.0,
+            'scenario': self.scenario,
+        }
+        physical_parameters = {
+            'density_liquid': self.density_liquid,
+            'density_gas': self.density_gas,
+            'surface_tension': self.surface_tension,
+            'mobility': self.mobility,
+            'gravitational_acceleration': self.gravitational_acceleration,
+            'relaxation_time_liquid': self.relaxation_time_liquid,
+            'relaxation_time_gas': self.relaxation_time_gas,
+            'interface_thickness': self.interface_width
+        }
+        torus = {
+            'Donut_midpoint': 5 * self.diameter,
+            'Donut_h': self.Donut_h,
+            'Donut_D': self.Donut_D,
+            'DonutTime': self.DonutTime
+        }
+        return {
+            'DomainSetup': domain_setup,
+            'Parameters': parameters,
+            'PhysicalParameters': physical_parameters,
+            'Boundaries': {'Border': walls},
+            'Torus': torus,
+        }
+
+    @wlb.member_callback
+    def interface_diffusion(self, blocks):
+        """Gaussian-filter (sigma=2) the array ``wlb.field.toArray`` returns for each block's 'phase' field."""
+        for phase in (wlb.field.toArray(blk['phase']) for blk in blocks):
+            phase[:, :, :] = gaussian_filter(phase[:, :, :], sigma=2)
+
+    @wlb.member_callback
+    def at_end_of_time_step(self, blocks, **kwargs):
+        """Measure the bubble every ``dbWriteFrequency`` time steps and append one row to ``self.csv_file``.
+
+        The 0.5 contour of the phase field is extracted in the x-z slice at the y position of the centre of
+        mass. Rays starting at (center_x, center_z) are tested against it: ``theta`` is 360 if the ray at 0
+        degrees touches the contour, otherwise 360 minus the sector between the first touching rays found
+        when sweeping towards +180 and -180 degrees. Every ``pngWriteFrequency`` time steps the measurement
+        is also stored as PNG and as pickled figure.
+
+        Args:
+            blocks: block storage that provides the 'phase' field
+            **kwargs: must contain ``timeStep``, ``target_rank``, ``bounding_box_min``, ``bounding_box_max``,
+                ``center_of_mass_X``/``_Y``/``_Z``, ``total_velocity_Y`` and ``sum_inv_phi``. All entries
+                except the two bounding box limits end up in the CSV row.
+
+        Raises:
+            AssertionError: if the gathered phase field contains non-finite values
+            KeyError: if one of the required ``kwargs`` entries is missing
+        """
+        t = kwargs["timeStep"]
+        target_rank = kwargs["target_rank"]
+        bounding_box_min = kwargs["bounding_box_min"]
+        bounding_box_max = kwargs["bounding_box_max"]
+        center_of_mass = [kwargs["center_of_mass_X"], kwargs["center_of_mass_Y"], kwargs["center_of_mass_Z"]]
+        if t % self.dbWriteFrequency != 0:
+            return
+
+        wlb_field = wlb.field.gather(blocks, 'phase', makeSlice[:, bounding_box_min:bounding_box_max, :],
+                                     target_rank)
+        if not wlb_field:
+            return
+
+        data = {'timestep': t}
+        for section in ('Parameters', 'DomainSetup', 'Torus'):
+            data.update(self.config_dict[section])
+        data.update(kwargs)
+        del data["bounding_box_min"]
+        del data["bounding_box_max"]
+        data['total_velocity_Y / sum_inv_phi'] = kwargs['total_velocity_Y'] / kwargs['sum_inv_phi']
+
+        phase_field = np.asarray(wlb_field).squeeze()
+        assert np.isfinite(np.sum(phase_field)), "NaN detected in bounding Box"
+        # largest index of a gas cell (phase < 0.5) along each axis of the gathered array
+        bubble_tip = np.max(np.where(phase_field < 0.5), axis=1)
+
+        fig_handle = plt.figure()
+        plt.axis('equal')
+        plt.gca().set(ylim=(0, self.diameter + 1))
+        slice_index = math.floor(center_of_mass[1]) - bounding_box_min
+        my_contour = plt.contour(np.rot90(phase_field[:, slice_index, :]), [0.5])
+
+        # Shift at the height of the centre of mass: zero below the transition zone, a cosine blend
+        # inside it and the full shift above it. The blend is zero at the start of the zone, so a centre
+        # of mass exactly on that boundary is treated as unshifted rather than fully shifted.
+        shift = self.ratio * self.center_x / 2
+        com_y = center_of_mass[1]
+        if com_y < self.start_transition:
+            applied_shift = 0
+        elif com_y < self.start_transition + self.length_transition:
+            tmp = math.pi * (com_y - self.start_transition) / self.length_transition
+            applied_shift = shift * 0.5 * (1 - math.cos(tmp))
+        else:
+            applied_shift = shift
+
+        # For eccentricity test cases the centre is moved, otherwise the radius is enlarged
+        center_x = self.center_x
+        center_z = self.center_z
+        new_radius = self.inner_radius
+        if self.eccentricity_or_pipe_ratio:
+            center_x = self.center_x + applied_shift
+        else:
+            new_radius = self.inner_radius + applied_shift
+
+        tol = 0.5
+
+        def first_contact(angles):
+            """Return the first angle whose ray touches the contour, or 0 if none does."""
+            return next((a for a in angles if test_line(center_x, center_z, a, my_contour, tol) != -1), 0)
+
+        # Search for two lines where one intersects and one does not:
+        angle = 0
+        angle1 = 0
+        if test_line(center_x, center_z, 0, my_contour, tol) == -1:
+            angle = first_contact(np.linspace(0, 180, 500))
+            angle1 = first_contact(np.linspace(0, -180, 500))
+            theta = 360 - (angle - angle1)
+        else:
+            theta = 360
+
+        if t % self.pngWriteFrequency == 0:
+            for ray_angle, line_style in ((angle, 'b-'), (angle1, 'r-')):
+                plt.plot(center_x + np.linspace(0, center_x) * np.cos(np.radians(ray_angle)),
+                         center_z + np.linspace(0, center_z) * np.sin(np.radians(ray_angle)), line_style)
+
+            radius = self.diameter // 2
+            circle1 = get_circle([radius, radius], radius)
+            plt.plot(circle1[0], circle1[1], 'k--', linewidth=2)
+
+            circle2 = get_circle([center_x, center_z], new_radius)
+            plt.fill(circle2[0], circle2[1], 'lightgrey')
+
+            base_name = f"angle_measurement_ratio_{self.ratio}_{self.counter:06d}"
+            plt.savefig(f"{base_name}.png", dpi=600)
+            with open(f"{base_name}.pkl", 'wb') as outfile:
+                pl.dump(fig_handle, outfile)
+            self.counter += 1
+
+        # Close the figure rather than only clearing its axes: plt.figure() above creates a new figure
+        # on every evaluation, and pyplot keeps each of them alive until it is closed.
+        plt.close(fig_handle)
+
+        data['bubble_tip_y'] = bubble_tip[1]
+        data['center_of_mass_x'] = center_of_mass[0]
+        data['center_of_mass_y'] = center_of_mass[1]
+        data['center_of_mass_z'] = center_of_mass[2]
+
+        data['xCells'] = self.size[0]
+        data['yCells'] = self.size[1]
+        data['zCells'] = self.size[2]
+
+        data['theta'] = theta
+        if self.eccentricity_or_pipe_ratio:
+            data['eccentricity'] = self.ratio
+        else:
+            data['pipe_ratio'] = self.ratio
+
+        sequenceValuesToScalars(data)
+
+        # mode='a' creates the file on first use; the header row is only written in that case
+        df = pd.DataFrame.from_records([data])
+        write_header = not os.path.isfile(self.csv_file)
+        df.to_csv(self.csv_file, index=False, mode='a', header=write_header)
+
+
+scenarios = wlb.ScenarioManager()
+scenarios.add(Scenario())
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/util.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9912edb76e510947883101e72ca60af2b97959d6
--- /dev/null
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/util.cpp
@@ -0,0 +1,147 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file util.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "core/cell/Cell.h"
+#include "core/math/Constants.h"
+#include "core/math/Random.h"
+
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "field/vtk/VTKWriter.h"
+
+#include <queue>
+namespace walberla
+{
+using flag_t          = walberla::uint8_t;
+using PhaseField_T    = GhostLayerField< real_t, 1 >;
+using VelocityField_T = GhostLayerField< real_t, 3 >;
+using FlagField_T     = FlagField< flag_t >;
+
+// Adds the gas-phase (phase field < 0.5) contributions of all cells whose flag equals the fluid flag, ghost
+// layers included: total_velocity[0] += |u| * invC, [1..3] += u_{x,y,z} * invC, [4] += invC (invC = 1 - phase).
+void calc_total_velocity(const shared_ptr< StructuredBlockStorage >& blocks, std::array< real_t, 5 >& total_velocity,
+                         BlockDataID phaseFieldID, BlockDataID velocityFieldID, ConstBlockDataID flagFieldID,
+                         FlagUID fluidFlagUID)
+{
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      auto velField   = block.getData< VelocityField_T >(velocityFieldID);
+      auto flagField  = block.getData< FlagField_T >(flagFieldID);
+      // The flag mask does not change within a block, so look it up once instead of once per cell.
+      const auto fluidFlag = flagField->getFlag(fluidFlagUID);
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(
+         phaseField,
+         if (flagField->get(x, y, z) == fluidFlag && phaseField->get(x, y, z) < 0.5) {
+            real_t invC = 1 - phaseField->get(x, y, z);
+            real_t U    = velField->get(x, y, z, 0);
+            real_t V    = velField->get(x, y, z, 1);
+            real_t W    = velField->get(x, y, z, 2);
+            total_velocity[0] += sqrt((U * invC) * (U * invC) + (V * invC) * (V * invC) + (W * invC) * (W * invC));
+            total_velocity[1] += U * invC;
+            total_velocity[2] += V * invC;
+            total_velocity[3] += W * invC;
+            total_velocity[4] += invC;
+         })
+   }
+}
+
+// Breadth-first flood fill over the connected region with phase field < 0.5, using the six axis neighbours.
+//
+// The seed is the cell at half the boundingBox extent in each direction; while its phase value exceeds 0.5 it
+// is moved towards x = 0, and the function aborts if no suitable seed is found. Fields are indexed with these
+// local cell coordinates; the boundingBox minimum is added when accumulating the centre of mass.
+//
+// Per visited cell, with invC = 1 - phase field:
+//    nrOfCells += 1, volume += invC,
+//    total_velocity[0] += |u| * invC, total_velocity[1..3] += u_{x,y,z} * invC,
+//    center_of_mass += cell position (averaged over nrOfCells before returning).
+// volume, nrOfCells and center_of_mass are reset on entry; total_velocity is only added to, not reset.
+void flood_fill(PhaseField_T& phaseField, VelocityField_T& velocityField, CellInterval boundingBox, real_t& volume,
+                uint_t& nrOfCells, std::array< real_t, 3 >& center_of_mass, std::array< real_t, 4 >& total_velocity)
+{
+   Cell startCell(boundingBox.xSize() / 2, boundingBox.ySize() / 2, boundingBox.zSize() / 2);
+   Field< bool, 1 > visit(phaseField.xSize(), phaseField.ySize(), phaseField.zSize(), false, field::fzyx);
+   using namespace stencil;
+
+   volume    = 0;
+   nrOfCells = 0;
+   center_of_mass.fill(real_t(0));
+
+   while (phaseField.get(startCell) > 0.5 && startCell.x() > 0)
+      --startCell.x();
+
+   if (phaseField.get(startCell) > 0.5) WALBERLA_ABORT("startCell for flood fill was not suitable")
+
+   std::queue< Cell > cellQueue;
+
+   // Marks a cell as visited, adds its contribution to all accumulators and schedules it for expansion.
+   auto visitCell = [&](const Cell& c) {
+      const real_t invC = 1 - phaseField.get(c);
+      const real_t v_U  = velocityField.get(c, 0);
+      const real_t v_V  = velocityField.get(c, 1);
+      const real_t v_W  = velocityField.get(c, 2);
+
+      nrOfCells++;
+      volume += invC;
+
+      total_velocity[0] +=
+         sqrt((v_U * invC) * (v_U * invC) + (v_V * invC) * (v_V * invC) + (v_W * invC) * (v_W * invC));
+      total_velocity[1] += v_U * invC;
+      total_velocity[2] += v_V * invC;
+      total_velocity[3] += v_W * invC;
+
+      // Shift by the bounding box origin of the matching axis (z uses zMin, not xMin).
+      center_of_mass[0] += (c.x() + boundingBox.xMin());
+      center_of_mass[1] += (c.y() + boundingBox.yMin());
+      center_of_mass[2] += (c.z() + boundingBox.zMin());
+
+      visit.get(c) = true;
+      cellQueue.push(c);
+   };
+
+   const int DIRS[6] = { N, S, E, W, T, B };
+   const CellInterval sizeInterval = phaseField.xyzSize();
+
+   visitCell(startCell);
+   while (!cellQueue.empty())
+   {
+      // Copy the front element: a reference to it would dangle once pop() has destroyed the element.
+      const Cell cell = cellQueue.front();
+      cellQueue.pop();
+
+      for (int i : DIRS)
+      {
+         const Cell neighborCell(cell.x() + cx[i], cell.y() + cy[i], cell.z() + cz[i]);
+
+         if (!sizeInterval.contains(neighborCell)) { continue; }
+
+         if (phaseField.get(neighborCell) < 0.5 && !visit.get(neighborCell))
+         {
+            visitCell(neighborCell);
+         }
+      }
+   }
+
+   // At least the seed cell has been counted, so nrOfCells is never zero here.
+   center_of_mass[0] = center_of_mass[0] / real_t(nrOfCells);
+   center_of_mass[1] = center_of_mass[1] / real_t(nrOfCells);
+   center_of_mass[2] = center_of_mass[2] / real_t(nrOfCells);
+}
+} // namespace walberla
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.h b/apps/showcases/PhaseFieldAllenCahn/GPU/util.h
similarity index 55%
rename from apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.h
rename to apps/showcases/PhaseFieldAllenCahn/GPU/util.h
index 901db8544732cea2953e651fc10c78943326bf56..cefd57d38736d9f8370a86c78e07215fa947e851 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/CalculateNormals.h
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/util.h
@@ -13,20 +13,31 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file CalculateNormals.h
+//! \file util.h
 //! \author Markus Holzer <markus.holzer@fau.de>
 //
 //======================================================================================================================
 
-#include "blockforest/StructuredBlockForest.h"
+#pragma once
 
-#include "domain_decomposition/BlockDataID.h"
-#include "domain_decomposition/IBlock.h"
+#include "python_coupling/DictWrapper.h"
 
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Constants.h"
+
+#include "field/communication/PackInfo.h"
 #include "field/FlagField.h"
+#include "field/vtk/VTKWriter.h"
+#include "GenDefines.h"
+
+namespace walberla {
+    // Adds the gas-phase velocity sums of all fluid-flagged cells (ghost layers included) to total_velocity.
+    void calc_total_velocity(const shared_ptr <StructuredBlockStorage> &blocks, std::array<real_t, 5> &total_velocity,
+                             BlockDataID phaseFieldID, BlockDataID velocityFieldID, ConstBlockDataID flagFieldID, FlagUID fluidFlagUID);
+
+    // Flood fill over the connected gas region; yields volume, cell count, centre of mass and velocity sums.
+    void flood_fill(PhaseField_T &phaseField, VelocityField_T &velocityField, CellInterval boundingBox, real_t &volume,
+                    uint_t &nrOfCells, std::array<real_t, 3> &center_of_mass, std::array<real_t, 4> &total_velocity);
+} // namespace walberla
 
-namespace walberla
-{
-void calculate_normals(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID normalsFieldID,
-                       ConstBlockDataID flagFieldID, FlagUID fluidFlagUID, FlagUID boundaryFlagUID);
-}
\ No newline at end of file
diff --git a/apps/tutorials/lbm/04_LBComplexGeometry.cpp b/apps/tutorials/lbm/04_LBComplexGeometry.cpp
index 9b02de05a655bd46b3f45ac0d3f3811501128b6e..dedc750f84b9354e9255365ba2da229aa1c3cdcb 100644
--- a/apps/tutorials/lbm/04_LBComplexGeometry.cpp
+++ b/apps/tutorials/lbm/04_LBComplexGeometry.cpp
@@ -162,8 +162,13 @@ int main(int argc, char** argv)
 
    WALBERLA_LOG_INFO_ON_ROOT("Octree has height " << distanceOctree->height())
 
+   //! [octreeVTK]
    // write distance octree to file
-   distanceOctree->writeVTKOutput("distanceOctree");
+   WALBERLA_ROOT_SECTION()
+   {
+      distanceOctree->writeVTKOutput("distanceOctree");
+   }
+   //! [octreeVTK]
 
    ///////////////////////////
    /// CREATE BLOCK FOREST ///
@@ -325,4 +330,4 @@ int main(int argc, char** argv)
 }
 } // namespace walberla
 
-int main(int argc, char** argv) { walberla::main(argc, argv); }
\ No newline at end of file
+int main(int argc, char** argv) { walberla::main(argc, argv); }
diff --git a/apps/tutorials/lbm/04_LBComplexGeometry.dox b/apps/tutorials/lbm/04_LBComplexGeometry.dox
index dd1d7e74ea03aa1fb39565c727493508bf346d49..5c8b1fa6a00a17ee69ce62b3ac7c77f1d69f4bd1 100644
--- a/apps/tutorials/lbm/04_LBComplexGeometry.dox
+++ b/apps/tutorials/lbm/04_LBComplexGeometry.dox
@@ -59,6 +59,9 @@ After calling this function, we prepare for building the distance octree by prec
 From this information we can finally build the distance octree. It stores information about how close or far boundaries are to each other. Later, this information could be used for e.g. adaptive mesh refinement (note that this will not be covered in this tutorial).
 \snippet 04_LBComplexGeometry.cpp octree
 
+When writing the distance octree to disk (e.g. for debugging purposes), care must be taken to execute the write function only on the root node:
+\snippet 04_LBComplexGeometry.cpp octreeVTK
+
 Even though we have successfully loaded the complex geometry and set up the corresponding distance octree, we have not defined our computational LB domain yet.
 In this tutorial, the LB domain is defined relatively to the loaded geometry. Henceforth, we calculate the axis-aligned bounding box of the geometry and scale it to our needs.
 Here, we chose our channel to be 10x3x1 times the size of the Stanford Bunny. This scaling is defined in the parameter file (parameter: domainScaling).
diff --git a/python/pystencils_walberla/boundary.py b/python/pystencils_walberla/boundary.py
index c81df201f43951031035c48771e3c891f3c36e33..22d3635dbb1143b7c52c0c4e1dc7339aeae16507 100644
--- a/python/pystencils_walberla/boundary.py
+++ b/python/pystencils_walberla/boundary.py
@@ -82,6 +82,7 @@ def generate_boundary(generation_context,
         'target': target,
         'namespace': namespace,
         'inner_or_boundary': boundary_object.inner_or_boundary,
+        'single_link': boundary_object.single_link,
         'additional_data_handler': additional_data_handler
     }
 
diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h
index 57630039fd2f5b6b623b6b68fd081e25a3f289f0..33c355a530e9313f52d9c623a5e71da71d0a9c7c 100644
--- a/python/pystencils_walberla/templates/Boundary.tmpl.h
+++ b/python/pystencils_walberla/templates/Boundary.tmpl.h
@@ -197,6 +197,7 @@ public:
         indexVectorInner.clear();
         indexVectorOuter.clear();
 
+        {% if inner_or_boundary -%}
         for( auto it = flagField->begin(); it != flagField->end(); ++it )
         {
             if( ! isFlagSet(it, domainFlag) )
@@ -204,11 +205,7 @@ public:
             {%- for dirIdx, dirVec, offset in additional_data_handler.stencil_info %}
             if ( isFlagSet( it.neighbor({{offset}} {%if dim == 3%}, 0 {%endif %}), boundaryFlag ) )
             {
-                {% if inner_or_boundary -%}
                 auto element = {{StructName}}(it.x(), it.y(), {%if dim == 3%} it.z(), {%endif %} {{dirIdx}} );
-                {% else -%}
-                auto element = {{StructName}}(it.x() + cell_idx_c({{dirVec[0]}}), it.y() + cell_idx_c({{dirVec[1]}}), {%if dim == 3%} it.z() + cell_idx_c({{dirVec[2]}}), {%endif %} {{additional_data_handler.inverse_directions[dirIdx]}} );
-                {% endif -%}
                 {{additional_data_handler.data_initialisation(dirIdx)|indent(16)}}
                 indexVectorAll.push_back( element );
                 if( inner.contains( it.x(), it.y(), it.z() ) )
@@ -218,6 +215,61 @@ public:
             }
             {% endfor %}
         }
+        {%else%}
+        auto flagWithGLayers = flagField->xyzSizeWithGhostLayer();
+        real_t dot = 0.0; real_t maxn = 0.0;
+        cell_idx_t calculated_idx = 0;
+        cell_idx_t dx = 0; cell_idx_t dy = 0; {%if dim == 3%}  cell_idx_t dz = 0; {% endif %}
+        cell_idx_t sum_x = 0; cell_idx_t sum_y = 0; {%if dim == 3%} cell_idx_t sum_z = 0; {%endif %}
+        for( auto it = flagField->beginWithGhostLayerXYZ(); it != flagField->end(); ++it )
+        {
+            {% if single_link -%}
+            sum_x = 0; sum_y = 0; {%if dim == 3%} sum_z = 0; {%endif %}
+            {% endif %}
+            if( ! isFlagSet(it, boundaryFlag) )
+                continue;
+            {%- for dirIdx, dirVec, offset in additional_data_handler.stencil_info %}
+            if ( flagWithGLayers.contains(it.x() + cell_idx_c({{dirVec[0]}}), it.y() + cell_idx_c({{dirVec[1]}}), it.z() + cell_idx_c({{dirVec[2]}})) && isFlagSet( it.neighbor({{offset}} {%if dim == 3%}, 0 {%endif %}), domainFlag ) )
+            {
+                {% if single_link -%}
+                sum_x += cell_idx_c({{dirVec[0]}}); sum_y += cell_idx_c({{dirVec[1]}}); {%if dim == 3%} sum_z += cell_idx_c({{dirVec[2]}}); {%endif %}
+                {% else %}
+                auto element = {{StructName}}(it.x(), it.y(), {%if dim == 3%} it.z(), {%endif %} {{dirIdx}} );
+                {{additional_data_handler.data_initialisation(dirIdx)|indent(16)}}
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                    indexVectorInner.push_back( element );
+                else
+                    indexVectorOuter.push_back( element );
+                {% endif %}
+            }
+            {% endfor %}
+
+        {% if single_link %}
+            dot = 0.0; maxn = 0.0; calculated_idx = 0;
+            if(sum_x != 0 or sum_y !=0 {%if dim == 3%} or sum_z !=0 {%endif %})
+            {
+            {%- for dirIdx, dirVec, offset in additional_data_handler.stencil_info %}
+                dx = {{dirVec[0]}}; dy = {{dirVec[1]}}; {%if dim == 3%} dz = {{dirVec[2]}}; {% endif %}
+                dot = dx*sum_x + dy*sum_y {%if dim == 3%} + dz*sum_z {% endif %};
+                if (dot > maxn)
+                {
+                    maxn = dot;
+                    calculated_idx = {{dirIdx}};
+                }
+            {% endfor %}
+                auto element = {{StructName}}(it.x(), it.y(), {%if dim == 3%} it.z(), {%endif %} calculated_idx );
+                indexVectorAll.push_back( element );
+                if( inner.contains( it.x(), it.y(), it.z() ) )
+                indexVectorInner.push_back( element );
+                else
+                indexVectorOuter.push_back( element );
+            }
+        {% endif -%}
+
+        }
+        {% endif %}
+
         indexVectors->syncGPU();
     }
 
diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py
index 0bdf6e9923fe4fb106ab2975109a7c31a7f0d633..cda1ddbec3a6f98a9ea75d96037866c5018875b8 100644
--- a/python/pystencils_walberla/utility.py
+++ b/python/pystencils_walberla/utility.py
@@ -11,7 +11,8 @@ def generate_info_header(ctx: CodeGenerationContext,
                          field_typedefs: dict = None,
                          additional_headers: set = None,
                          headers_to_ignore: set = None,
-                         additional_typedefs: dict = None):
+                         additional_typedefs: dict = None,
+                         additional_code: str = ""):
     """Generates an info header, consolidating required information about the generated code.
     The info header #includes all generated header files, and is thus the only header the
     application needs to #include. It can also contain aliases for waLBerla stencil types and
@@ -24,7 +25,8 @@ def generate_info_header(ctx: CodeGenerationContext,
         field_typedefs: dict mapping type names to pystencils `Field` instances
         additional_headers: additional header files to be included
         headers_to_ignore: headers which should not be included
-        additional_typedefs: dict mapping aliases to types.
+        additional_typedefs: dict mapping aliases to types
+        additional_code: additional code that is appended verbatim to the end of the generated header file
     """
     stencil_typedefs = stencil_typedefs if stencil_typedefs is not None else dict()
     field_typedefs = field_typedefs if field_typedefs is not None else dict()
@@ -50,7 +52,7 @@ def generate_info_header(ctx: CodeGenerationContext,
     if path.splitext(filename)[1] not in HEADER_EXTENSIONS:
         filename += '.h'
 
-    ctx.write_file(filename, lines)
+    ctx.write_file(filename, lines + additional_code)
 
 
 #   ------------------------------------- INTERNAL -------------------------------------------------------------
diff --git a/src/core/math/Matrix2.h b/src/core/math/Matrix2.h
index 7132b05d06b6b524024587c40bb1f1dd85a12566..308108649ce4d6583194c73521824be91c99d638 100644
--- a/src/core/math/Matrix2.h
+++ b/src/core/math/Matrix2.h
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -830,12 +830,12 @@ inline const Matrix2<Type> fabs( const Matrix2<Type>& m );
 // \param matrix The right-hand-side matrix for the multiplication.
 // \return The scaled result matrix.
 */
-//template< typename Type, typename Other >
-//inline const Matrix2<HIGH> operator*( Other scalar, const Matrix2<Type>& matrix )
-//{
-//   static_assert( ! std::is_scalar<Other>::value, "Only scalar types allowed" );
-//   return matrix*scalar;
-//}
+template< typename Type, typename Other >
+inline typename std::enable_if< std::is_arithmetic< Other >::value, Matrix2< HIGH > >::type
+operator*(Other scalar, const Matrix2< Type >& matrix)
+{ // enabled for arithmetic scalars only (same constraint as the Matrix3 overload); excludes void/nullptr_t
+   return matrix * scalar; // reuse the matrix-times-scalar operator with swapped operands
+}
 //**********************************************************************************************************************
 
 
diff --git a/src/core/math/Matrix3.h b/src/core/math/Matrix3.h
index d6020cd539992e0f5863eb8e25a553da40dbe6a1..8c4259b05c48fd01d6e452a9dc9a219490aef150 100644
--- a/src/core/math/Matrix3.h
+++ b/src/core/math/Matrix3.h
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -441,7 +441,7 @@ Matrix3<Type> Matrix3<Type>::makeDiagonalMatrix( const Type d )
 //**********************************************************************************************************************
 /*!\fn Matrix3<Type> Matrix3<Type>::makeIdentityMatrix()
 // \brief Named constructor to create the identity matrix.
-// 
+//
 // All diagonal elements are initialized to one, alls others to zero.
 //
 */
@@ -1404,12 +1404,12 @@ inline const Matrix3<Type> fabs( const Matrix3<Type>& m );
 // \param matrix The right-hand-side matrix for the multiplication.
 // \return The scaled result matrix.
 */
-//template< typename Type, typename Other >
-//inline const Matrix3<HIGH> operator*( Other scalar, const Matrix3<Type>& matrix )
-//{
-//   static_assert( ! std::is_scalar<Other>::value, "Only scalar types allowed" );
-//   return matrix*scalar;
-//}
+template< typename Type, typename Other >
+inline typename std::enable_if< std::is_arithmetic< Other >::value, const Matrix3< HIGH > >::type
+operator*(Other scalar, const Matrix3< Type >& matrix)
+{ // overload is only enabled (via std::enable_if) when Other is an arithmetic type
+   return matrix * scalar; // reuse the matrix-times-scalar operator with swapped operands
+}
 //**********************************************************************************************************************
 
 //*************************************************************************************************
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index b701f144f972b8eb496849d260968f21306542eb..46b22c43a0056afc407c98341384e8d22b5147f8 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -4,7 +4,7 @@
 #
 ###################################################################################################
 
-waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil
+waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil lbm
                      BUILD_ONLY_IF_FOUND CUDA )
 
 ###################################################################################################
\ No newline at end of file
diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c7d2c0fbd9f1d152793bcbe84d577dc708ed9
--- /dev/null
+++ b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h
@@ -0,0 +1,86 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CombinedInPlaceGpuPackInfo.h
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+// Evaluates to 1 if the integer expression x is even and to 0 otherwise.
+// The argument is parenthesized so that compound expressions expand correctly.
+#define IS_EVEN(x) (((x) & 1) ^ 1)
+
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+
+#include "lbm/inplace_streaming/TimestepTracker.h"
+
+namespace walberla {
+namespace lbm {
+
+/// GPU pack info that wraps two pack infos and delegates every call to one of them, selected by
+/// the parity of the counter of a lbm::TimestepTracker: EvenPackInfo while the counter is even,
+/// OddPackInfo while it is odd.
+template< typename EvenPackInfo, typename OddPackInfo >
+class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo
+{
+ public:
+   /// \param tracker  tracker whose counter is queried on every pack/unpack/size call
+   /// \param args     constructor arguments passed to both the even and the odd pack info
+   template< typename... Args >
+   CombinedInPlaceGpuPackInfo(const std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args)
+      // The arguments are used twice, so the first use must not forward them: forwarding an rvalue
+      // could move it away before oddPackInfo_ (initialized last) is constructed from it.
+      : tracker_(tracker), evenPackInfo_(args...), oddPackInfo_(std::forward< Args >(args)...)
+   {}
+
+   virtual ~CombinedInPlaceGpuPackInfo() = default;
+
+   /// Delegates packing to the pack info selected by the current counter parity.
+   void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override
+   {
+      if (isEvenStep()) { evenPackInfo_.pack(dir, buffer, block, stream); }
+      else { oddPackInfo_.pack(dir, buffer, block, stream); }
+   }
+
+   /// Delegates unpacking to the pack info selected by the current counter parity.
+   void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override
+   {
+      if (isEvenStep()) { evenPackInfo_.unpack(dir, buffer, block, stream); }
+      else { oddPackInfo_.unpack(dir, buffer, block, stream); }
+   }
+
+   /// Returns the size reported by the pack info selected by the current counter parity.
+   uint_t size(stencil::Direction dir, IBlock* block) override
+   {
+      return isEvenStep() ? evenPackInfo_.size(dir, block) : oddPackInfo_.size(dir, block);
+   }
+
+ private:
+   /// \return true while the tracker's counter is even
+   bool isEvenStep() const { return IS_EVEN(tracker_->getCounter()) != 0; }
+
+   // Stored by value (shared ownership): a reference member would dangle as soon as the
+   // shared_ptr object passed to the constructor goes out of scope.
+   std::shared_ptr< lbm::TimestepTracker > tracker_;
+   EvenPackInfo evenPackInfo_;
+   OddPackInfo oddPackInfo_;
+};
+
+} // namespace lbm
+} // namespace walberla
+
+
diff --git a/src/field/Field.h b/src/field/Field.h
index a47fe47e7314011fe0bae2df37ca090facabb321..a01fc76cddc32539c7e482e0159a249f1189c54f 100644
--- a/src/field/Field.h
+++ b/src/field/Field.h
@@ -253,10 +253,10 @@ namespace field {
 
       inline Layout layout() const { return layout_; }
 
-      cell_idx_t xStride() const { return xfact_; }
-      cell_idx_t yStride() const { return yfact_; }
-      cell_idx_t zStride() const { return zfact_; }
-      cell_idx_t fStride() const { return ffact_; }
+      int64_t xStride() const { return xfact_; }
+      int64_t yStride() const { return yfact_; }
+      int64_t zStride() const { return zfact_; }
+      int64_t fStride() const { return ffact_; }
 
       cell_idx_t xOff() const { return xOff_; }
       cell_idx_t yOff() const { return yOff_; }
@@ -369,10 +369,10 @@ namespace field {
       Layout layout_;        //!< Determines in which order the values are stored
 
       uint_t     allocSize_; //!< The overall size of the T* (padding included)
-      cell_idx_t ffact_;     //!< Access multiplication factor for the f-dimension.
-      cell_idx_t xfact_;     //!< Access multiplication factor for the x-dimension.
-      cell_idx_t yfact_;     //!< Access multiplication factor for the y-dimension.
-      cell_idx_t zfact_;     //!< Access multiplication factor for the z-dimension.
+      int64_t ffact_;     //!< Access multiplication factor for the f-dimension.
+      int64_t xfact_;     //!< Access multiplication factor for the x-dimension.
+      int64_t yfact_;     //!< Access multiplication factor for the y-dimension.
+      int64_t zfact_;     //!< Access multiplication factor for the z-dimension.
 
       shared_ptr<FieldAllocator<T> > allocator_; //!< Allocator for the field
 
diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h
index 20ee14bb95a2df3b50975f42c7ce96fb047c1dcf..c5fec45c77194c58cf10f2e0ba633740d54d89eb 100644
--- a/src/field/Field.impl.h
+++ b/src/field/Field.impl.h
@@ -322,14 +322,14 @@ namespace field {
          const uint_t alignment = 64;
 #elif defined(__ARM_NEON)
          const uint_t alignment = 16;
+#elif defined(__BIGGEST_ALIGNMENT__)
+         const uint_t alignment = __BIGGEST_ALIGNMENT__;
 #elif defined(__AVX512F__)
          const uint_t alignment = 64;
 #elif defined(__AVX__)
          const uint_t alignment = 32;
 #elif defined(__SSE__) || defined(_MSC_VER)
          const uint_t alignment = 16;
-#elif defined(__BIGGEST_ALIGNMENT__)
-         const uint_t alignment = __BIGGEST_ALIGNMENT__;
 #else
          const uint_t alignment = 64;
 #endif
@@ -361,24 +361,24 @@ namespace field {
          fAllocSize_ = fSize_;
 
          WALBERLA_CHECK_LESS_EQUAL( fSize_ * xAllocSize_ * yAllocSize_ * zAllocSize_ + xSize_ + ySize_ * xAllocSize_ + zSize_ * xAllocSize_ * yAllocSize_,
-                                    std::numeric_limits< cell_idx_t >::max(),
-                                    "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+                                    std::numeric_limits< int64_t >::max(),
+                                    "The data type 'int64_t' is too small for your field size! Your field is too large." );
 
-         ffact_ = cell_idx_c(xAllocSize_ * yAllocSize_ * zAllocSize_);
-         zfact_ = cell_idx_c(xAllocSize_ * yAllocSize_);
-         yfact_ = cell_idx_c(xAllocSize_);
+         ffact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_) * int64_t(zAllocSize_);
+         zfact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_);
+         yfact_ = int64_t(xAllocSize_);
          xfact_ = 1;
       } else {
          values_ = allocator_->allocate(zSize_, ySize_, xSize_, fSize_, yAllocSize_, xAllocSize_, fAllocSize_);
          zAllocSize_ = zSize_;
 
          WALBERLA_CHECK_LESS_EQUAL( fSize_ + xSize_ * fAllocSize_ + ySize_ * fAllocSize_ * xAllocSize_ + zSize_ * fAllocSize_ * xAllocSize_ * yAllocSize_,
-                                    std::numeric_limits< cell_idx_t >::max(),
-                                    "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+                                    std::numeric_limits< int64_t >::max(),
+                                    "The data type 'int64_t' is too small for your field size! Your field is too large." );
 
-         zfact_ = cell_idx_c(fAllocSize_ * xAllocSize_ * yAllocSize_);
-         yfact_ = cell_idx_c(fAllocSize_ * xAllocSize_);
-         xfact_ = cell_idx_c(fAllocSize_);
+         zfact_ = int64_t (fAllocSize_) * int64_t(xAllocSize_) * int64_t(yAllocSize_);
+         yfact_ = int64_t(fAllocSize_) * int64_t(xAllocSize_);
+         xfact_ = int64_t (fAllocSize_);
          ffact_ = 1;
       }
 
@@ -721,7 +721,7 @@ namespace field {
    {
       assertValidCoordinates( x, y, z, f );
 
-      const cell_idx_t index = f*ffact_+ x*xfact_+ y*yfact_+ z*zfact_;
+      const int64_t index = f*int64_t(ffact_) + int64_t(x)*int64_t(xfact_) + int64_t(y)*int64_t(yfact_) + int64_t(z)*int64_t(zfact_);
 
       WALBERLA_ASSERT_LESS( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(allocSize_) );
       WALBERLA_ASSERT_GREATER_EQUAL( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(0) );
@@ -1102,7 +1102,7 @@ namespace field {
       xSize_ = xs;
       ySize_ = ys;
       zSize_ = zs;
-      const auto offset = xOff_*xfact_+ yOff_*yfact_+ zOff_*zfact_;
+      const int64_t offset = int64_t(xOff_)*int64_t(xfact_) + int64_t(yOff_)*int64_t(yfact_) + int64_t(zOff_)*int64_t(zfact_);
       valuesWithOffset_ = values_ + offset;
    }
 
diff --git a/src/lbm/all.h b/src/lbm/all.h
index d4b0a4f1559eaa977cb17e43b41ef75f48323f7b..dc581a251d6f31ce10032c74ccbf2147a7f90780 100644
--- a/src/lbm/all.h
+++ b/src/lbm/all.h
@@ -33,6 +33,7 @@
 #include "field/all.h"
 #include "geometry/all.h"
 #include "gui/all.h"
+#include "inplace_streaming/all.h"
 #include "lattice_model/all.h"
 #include "refinement/all.h"
 #include "sweeps/all.h"
diff --git a/src/lbm/boundary/SimpleDiffusionDirichlet.h b/src/lbm/boundary/SimpleDiffusionDirichlet.h
index 81805c5b80a4adf75498d0b1930424d000f1af91..5fa9db33d07f1559913f0e0a8c9fb590b3dd3e4e 100644
--- a/src/lbm/boundary/SimpleDiffusionDirichlet.h
+++ b/src/lbm/boundary/SimpleDiffusionDirichlet.h
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -108,9 +108,9 @@ private:
 
 template< typename LatticeModel_T, typename flag_t >
 inline SimpleDiffusionDirichlet< LatticeModel_T, flag_t >::SimpleDiffusionDirichlet( const BoundaryUID& boundaryUID, const FlagUID& uid, PDFField* const pdfField, const real_t val ) :
-   Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), val_( val ), init_(false) 
+   Boundary<flag_t>( boundaryUID ), uid_( uid ), pdfField_( pdfField ), val_( val ), init_(false)
 {
-   WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ ); 
+   WALBERLA_ASSERT_NOT_NULLPTR( pdfField_ );
 }
 
 
@@ -139,17 +139,8 @@ inline void SimpleDiffusionDirichlet< LatticeModel_T, flag_t >::registerCell( Bu
 
 template< typename LatticeModel_T, typename flag_t >
 inline void SimpleDiffusionDirichlet< LatticeModel_T, flag_t >::registerCell( const flag_t, const cell_idx_t, const cell_idx_t, const cell_idx_t,
-                                                                                const BoundaryConfiguration & bc )
+                                                                                const BoundaryConfiguration & )
 {
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const ScalarConfiguration * >( &bc ), &bc );
-
-   const ScalarConfiguration & sclConfig = dynamic_cast< const ScalarConfiguration & >( bc );
-
-   if( init_ )
-      WALBERLA_ASSERT_FLOAT_EQUAL( val_, sclConfig.val() );
-
-   init_ = true;
-   val_  = sclConfig.val();
 }
 
 
@@ -183,7 +174,7 @@ inline void SimpleDiffusionDirichlet< LatticeModel_T, flag_t >::treatDirection(
    WALBERLA_ASSERT_EQUAL( ny, y + cell_idx_c( stencil::cy[ dir ] ) );
    WALBERLA_ASSERT_EQUAL( nz, z + cell_idx_c( stencil::cz[ dir ] ) );
    WALBERLA_ASSERT_UNEQUAL( mask & this->mask_, numeric_cast<flag_t>(0) );
-   WALBERLA_ASSERT_EQUAL  ( mask & this->mask_, this->mask_ ); 
+   WALBERLA_ASSERT_EQUAL  ( mask & this->mask_, this->mask_ );
    // only true if "this->mask_" only contains one single flag, which is the case for the current implementation of this boundary condition (SimpleDiffusionDirichlet)
 
    pdfField_->get( nx, ny, nz, Stencil::invDirIdx(dir) ) = real_t(2) * val_ * LatticeModel_T::w[Stencil::idx[dir]] - pdfField_->get( x, y, z, Stencil::idx[dir] );
diff --git a/src/lbm/inplace_streaming/TimestepTracker.h b/src/lbm/inplace_streaming/TimestepTracker.h
index c0ca22635fea5d9a454f99e2d4fe08a519578eef..daba8db2380af5895f15c1f5688cf59f06b28f8f 100644
--- a/src/lbm/inplace_streaming/TimestepTracker.h
+++ b/src/lbm/inplace_streaming/TimestepTracker.h
@@ -29,10 +29,10 @@ namespace lbm
 class TimestepTracker
 {
  private:
-   uint8_t counter_;
+   uint8_t counter_{ 0 };
 
  public:
-   TimestepTracker() : counter_(0) {}
+   TimestepTracker() = default;
    TimestepTracker(uint8_t zeroth_timestep) : counter_(zeroth_timestep & 1) {}
 
    void advance() { counter_ = (counter_ + 1) & 1; }
@@ -43,6 +43,7 @@ class TimestepTracker
    }
 
    uint8_t getCounter() const { return counter_; }
+   uint8_t getCounterPlusOne() const { return (counter_ + 1) & 1; }
 
 }; // class TimestepTracker
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.h b/src/lbm/inplace_streaming/all.h
similarity index 66%
rename from apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.h
rename to src/lbm/inplace_streaming/all.h
index 901db8544732cea2953e651fc10c78943326bf56..b984e282387c2b39c07819ddeebc58c0bf9327fb 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/CalculateNormals.h
+++ b/src/lbm/inplace_streaming/all.h
@@ -13,20 +13,11 @@
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file CalculateNormals.h
-//! \author Markus Holzer <markus.holzer@fau.de>
+//! \file all.h
+//! \author Frederik Hennig <frederik.hennig@fau.de>
 //
 //======================================================================================================================
 
-#include "blockforest/StructuredBlockForest.h"
+#pragma once
 
-#include "domain_decomposition/BlockDataID.h"
-#include "domain_decomposition/IBlock.h"
-
-#include "field/FlagField.h"
-
-namespace walberla
-{
-void calculate_normals(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID normalsFieldID,
-                       ConstBlockDataID flagFieldID, FlagUID fluidFlagUID, FlagUID boundaryFlagUID);
-}
\ No newline at end of file
+#include "TimestepTracker.h"
diff --git a/src/mesh_common/distance_octree/DistanceOctree.h b/src/mesh_common/distance_octree/DistanceOctree.h
index ce1a21c16cd56f8ba9d78d59e3ae4570aa352968..7e58f969ea3e09b3ec385481bd8dc36edd6efeb9 100644
--- a/src/mesh_common/distance_octree/DistanceOctree.h
+++ b/src/mesh_common/distance_octree/DistanceOctree.h
@@ -133,6 +133,20 @@ protected:
 
 
 
+//**********************************************************************************************************************
+/*! \brief Write the distance octree to a VTK file.
+ *
+ * This method should only be called by the root process:
+ * \code
+     WALBERLA_ROOT_SECTION()
+     {
+        distanceOctree->writeVTKOutput("distanceOctree");
+     }
+ * \endcode
+ *
+ * \param filestem name of the VTK file without extension
+ */
+//**********************************************************************************************************************
 template <typename MeshType>
 void DistanceOctree<MeshType>::writeVTKOutput( const std::string & filestem ) const
 {
diff --git a/src/stencil/Directions.h b/src/stencil/Directions.h
index cc3cfa29b1e320ca1b0f9c49f118402585036e21..c13d95466406f8d63dff04a1021c9779fb310c57 100644
--- a/src/stencil/Directions.h
+++ b/src/stencil/Directions.h
@@ -271,8 +271,8 @@ namespace stencil {
 
    /// Maps (direction,axis) pair to direction
    /// \param axis     0,1 or 2 standing for x,y,z
-   /// \param minOrMax if false, the direction pointing in the negative axis direction is returned,
-   ///                 if true, the positive axis direction
+   /// \param minOrMax if true, the direction pointing in the negative axis direction is returned,
+   ///                 if false, the positive axis direction
    inline Direction directionFromAxis( int axis, bool minOrMax )
    {
       WALBERLA_ASSERT_LESS( axis, 3 );
@@ -288,8 +288,8 @@ namespace stencil {
 
    /// Maps (direction,axis) pair to direction
    /// \param axis     0,1 or 2 standing for x,y,z
-   /// \param minOrMax if false, the direction pointing in the negative axis direction is returned,
-   ///                 if true, the positive axis direction
+   /// \param minOrMax if true, the direction pointing in the negative axis direction is returned,
+   ///                 if false, the positive axis direction
    inline Direction directionFromAxis( uint_t axis, bool minOrMax )
    {
       WALBERLA_ASSERT_LESS( axis, 3 );
diff --git a/tests/core/math/Matrix3Test.cpp b/tests/core/math/Matrix3Test.cpp
index 7fd6aea27d2fdea315a27e87ea4dd342273840e4..d47368f290ab2310d321ed443b464e81b0ae1063 100644
--- a/tests/core/math/Matrix3Test.cpp
+++ b/tests/core/math/Matrix3Test.cpp
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -19,67 +19,75 @@
 //
 //======================================================================================================================
 
-#include "core/debug/TestSubsystem.h"
 #include "core/math/Matrix3.h"
 
-#include <iostream>
+#include "core/debug/TestSubsystem.h"
 
+#include <iostream>
 
 using namespace walberla;
 using walberla::uint8_t;
 
-
 void rotationTest()
 {
-   Matrix3<real_t> rotationMatrix (0.0);
-   rotationMatrix(0,0) = 1.0;
-   rotationMatrix(1,1) = 1.0;
-   rotationMatrix(2,2) = 1.0;
+   Matrix3< real_t > rotationMatrix(0.0);
+   rotationMatrix(0, 0) = 1.0;
+   rotationMatrix(1, 1) = 1.0;
+   rotationMatrix(2, 2) = 1.0;
 
-   Matrix3<real_t> diagonalMatrix ( 0.0 );
-   diagonalMatrix(0,0) = 2.0;
-   diagonalMatrix(1,1) = 4.0;
-   diagonalMatrix(2,2) = 6.0;
+   Matrix3< real_t > diagonalMatrix(0.0);
+   diagonalMatrix(0, 0) = 2.0;
+   diagonalMatrix(1, 1) = 4.0;
+   diagonalMatrix(2, 2) = 6.0;
 
-   Matrix3<real_t> result = rotationMatrix.rotate( diagonalMatrix );
+   Matrix3< real_t > result = rotationMatrix.rotate(diagonalMatrix);
 
-   std::cout << diagonalMatrix  << std::endl;
+   std::cout << diagonalMatrix << std::endl;
    std::cout << result << std::endl;
 
-   WALBERLA_CHECK_FLOAT_EQUAL( result(0,0), 2.0 );
-   WALBERLA_CHECK_FLOAT_EQUAL( result(1,1), 4.0 );
-   WALBERLA_CHECK_FLOAT_EQUAL( result(2,2), 6.0 );
+   WALBERLA_CHECK_FLOAT_EQUAL(result(0, 0), 2.0);
+   WALBERLA_CHECK_FLOAT_EQUAL(result(1, 1), 4.0);
+   WALBERLA_CHECK_FLOAT_EQUAL(result(2, 2), 6.0);
 
+   for (uint_t i = 0; i < 3; ++i)
+      for (uint_t j = 0; j < 3; ++j)
+         if (i != j) WALBERLA_CHECK_FLOAT_EQUAL(result(i, j), 0.0);
 
-   for( uint_t i = 0; i < 3; ++i )
-      for( uint_t j = 0; j < 3; ++j )
-         if ( i != j)
-            WALBERLA_CHECK_FLOAT_EQUAL( result(i,j), 0.0 );
-
-   //also checking WALBERLA_CHECK_FLOAT_EQUAL for matrices
-   Matrix3<real_t> cmp(2,0,0,0,4,0,0,0,6);
-   WALBERLA_CHECK_FLOAT_EQUAL( result, cmp );
-   WALBERLA_CHECK_FLOAT_EQUAL_EPSILON( result, cmp, real_t(1e-5) );
+   // also checking WALBERLA_CHECK_FLOAT_EQUAL for matrices
+   Matrix3< real_t > cmp(2, 0, 0, 0, 4, 0, 0, 0, 6);
+   WALBERLA_CHECK_FLOAT_EQUAL(result, cmp);
+   WALBERLA_CHECK_FLOAT_EQUAL_EPSILON(result, cmp, real_t(1e-5));
 }
 
 void RARTTest()
 {
-   Matrix3<real_t> A ( 1,2,3,4,5,6,7,8,9 );
-   Matrix3<real_t> R ( 2,3,4,5,6,7,8,9,1 );
-   WALBERLA_CHECK_FLOAT_EQUAL( math::transformMatrixRART(R,A), R * A * R.getTranspose() );
+   Matrix3< real_t > A(1, 2, 3, 4, 5, 6, 7, 8, 9);
+   Matrix3< real_t > R(2, 3, 4, 5, 6, 7, 8, 9, 1);
+   WALBERLA_CHECK_FLOAT_EQUAL(math::transformMatrixRART(R, A), R * A * R.getTranspose());
 }
 
-int main()
+void scalarMultiplicationTest()
 {
+   const Matrix3< real_t > A(1, 2, 3, 4, 5, 6, 7, 8, 9);
+   const real_t s = real_c(5);
 
-   Matrix3<real_t> m1 ( 1.0 );
-   Matrix3<real_t> m2 ( 2.0 );
+   const Matrix3< real_t > solution(5, 10, 15, 20, 25, 30, 35, 40, 45);
+
+   WALBERLA_CHECK_FLOAT_EQUAL(A * s, solution);
+   WALBERLA_CHECK_FLOAT_EQUAL(s * A, A * s);
+}
+
+int main()
+{
+   Matrix3< real_t > m1(1.0);
+   Matrix3< real_t > m2(2.0);
 
-   // the following line gives a compile error when the operator*(Other, Matrix3) is commented in
-   m1 * m2;
+   // in an incorrect implementation of operator*(Other, Matrix3), the following line might give a compile error
+   m1* m2;
 
    rotationTest();
    RARTTest();
+   scalarMultiplicationTest();
 
    return 0;
 }
diff --git a/tests/core/math/MatrixVector2Test.cpp b/tests/core/math/MatrixVector2Test.cpp
index 0e4266c83202ac0a90b01bc7c3526b8ae0a9fd9b..9628040a4e70aaa55da18a383e6997f0b09d3711 100644
--- a/tests/core/math/MatrixVector2Test.cpp
+++ b/tests/core/math/MatrixVector2Test.cpp
@@ -1,15 +1,15 @@
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -25,46 +25,46 @@
 
 #include <iostream>
 
-
 using namespace walberla;
 using walberla::uint8_t;
 
 void matrix2Test()
 {
-   Matrix2<real_t> m1 ( 2, 1, -1, 5 );
-   Matrix2<real_t> m2 ( 4, 2, -2, 1 );
+   Matrix2< real_t > m1(2, 1, -1, 5);
+   Matrix2< real_t > m2(4, 2, -2, 1);
 
-   WALBERLA_CHECK_EQUAL(  m1 * m2 * m2.getInverse(), m1 );
-   WALBERLA_CHECK_EQUAL(  m1 * m1.getInverse() * m2, m2 );
+   WALBERLA_CHECK_EQUAL(m1 * m2 * m2.getInverse(), m1); // multiplying by m2 and then by its inverse must give back m1
+   WALBERLA_CHECK_EQUAL(m1 * m1.getInverse() * m2, m2); // m1 times its own inverse must leave m2 unchanged
 
-   Vector2<real_t> v1 ( 5, 7 );
+   Vector2< real_t > v1(5, 7);
 
-   WALBERLA_CHECK_EQUAL(  m2 * v1, Vector2<real_t> (34, -3) );
-}
+   WALBERLA_CHECK_EQUAL(m2 * v1, Vector2< real_t >(34, -3)); // matrix-vector product: (4*5 + 2*7, -2*5 + 1*7)
 
+   const real_t s = real_c(5);
+   // scalar multiplication must give the same element-wise scaled matrix from either side
+   WALBERLA_CHECK_EQUAL(s * m1, Matrix2< real_t >(10, 5, -5, 25));
+   WALBERLA_CHECK_EQUAL(m1 * s, Matrix2< real_t >(10, 5, -5, 25));
+}
 
 void vector2Test()
 {
-   Vector2<real_t> v1( 1,2 );
-   Vector2<real_t> v2( 3,4 );
-   Vector2<uint_t> v3( 4,3 );
+   Vector2< real_t > v1(1, 2);
+   Vector2< real_t > v2(3, 4);
+   Vector2< uint_t > v3(4, 3); // unsigned vector, used for the indexOfMin/indexOfMax checks below
 
    auto sum = v1 + v2;
-   WALBERLA_CHECK_EQUAL( sum, Vector2<real_t>(4,6 ) );
+   WALBERLA_CHECK_EQUAL(sum, Vector2< real_t >(4, 6)); // component-wise sum: (1 + 3, 2 + 4)
 
    auto product = v1 * v2;
-   WALBERLA_CHECK_FLOAT_EQUAL( product, 1.0*3 + 2.0*4 );
-
+   WALBERLA_CHECK_FLOAT_EQUAL(product, 1.0 * 3 + 2.0 * 4); // v1 * v2 is compared against the dot product
 
-   WALBERLA_CHECK_FLOAT_EQUAL( v2.length(),    5.0  );
-   WALBERLA_CHECK_FLOAT_EQUAL( v2.sqrLength(), 25.0 );
-
-   WALBERLA_CHECK_EQUAL( v3.indexOfMin(), 1 );
-   WALBERLA_CHECK_EQUAL( v3.indexOfMax(), 0 );
+   WALBERLA_CHECK_FLOAT_EQUAL(v2.length(), 5.0);     // sqrt(3^2 + 4^2)
+   WALBERLA_CHECK_FLOAT_EQUAL(v2.sqrLength(), 25.0); // 3^2 + 4^2
 
+   WALBERLA_CHECK_EQUAL(v3.indexOfMin(), 1); // v3 = (4, 3): the smaller component is at index 1
+   WALBERLA_CHECK_EQUAL(v3.indexOfMax(), 0); // v3 = (4, 3): the larger component is at index 0
 }
 
-
 int main()
 {
    vector2Test();
diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
index 298727b46c428384eeef7f755e8bfe4881d53d60..45bdc303c9f9983c7c4120748e4b1fba1f23ecf7 100644
--- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
@@ -1,5 +1,5 @@
 import pystencils as ps
-from lbmpy.updatekernels import create_stream_pull_only_kernel
+from lbmpy.updatekernels import create_stream_only_kernel
 from lbmpy.stencils import get_stencil
 from pystencils_walberla import CodeGeneration, generate_sweep
 
@@ -8,15 +8,14 @@ with CodeGeneration() as ctx:
     dtype = 'float64' if ctx.double_accuracy else 'float32'
 
     # Copy sweep
-    src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size),
-                         layout='fzyx')
+    src, dst = ps.fields(f"src({f_size}), dst({f_size}) : {dtype}[3D]", layout='fzyx')
+    # one assignment per index i in range(f_size): dst(i) = src(i)
     copy_only = [ps.Assignment(dst(i), src(i)) for i in range(f_size)]
     generate_sweep(ctx, 'MicroBenchmarkCopyKernel', copy_only,
                    target='gpu', gpu_indexing_params={'block_size': (128, 1, 1)})
 
     # Stream-only sweep
     stencil = get_stencil("D3Q19")
-    stream_only = create_stream_pull_only_kernel(stencil, src_field_name='src', dst_field_name='dst',
-                                                 generic_field_type=dtype, generic_layout='fzyx')
+    stream_only = create_stream_only_kernel(stencil, src_field=src, dst_field=dst)  # reuses the src/dst fields declared for the copy sweep
     generate_sweep(ctx, 'MicroBenchmarkStreamKernel', stream_only,
                    target='gpu', gpu_indexing_params={'block_size': (128, 1, 1)})