diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
index 2c447a3d88f3d331da8f4bc0fb7daa082584707e..88a410c1062a0699ac93058104d6dcc46f51a5bf 100755
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
@@ -3,30 +3,43 @@ import waLBerla as wlb
 import pandas as pd
 
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
 
 import sys
+from math import prod
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+
+    cells = prod(b + 2 * gls for b in block_size)
+    # 3 values for the velocity and two for the phase field and the temporary phase field
+    values_per_cell = 2 * q_phase + 2 * q_hydro + 3 + 2
+    needed_memory = values_per_cell * cells * size_per_value
+    return needed_memory < total_mem
 
 
 class Scenario:
-    def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
+    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
         # output frequencies
         self.vtkWriteFrequency = 0
 
         # simulation parameters
         self.timesteps = 101
-        edge_size = 32
-        self.cells = (edge_size, edge_size, edge_size)
-        self.blocks = (1, 1, 1)
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
         self.periodic = (1, 1, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])
 
-        self.timeStepStrategy = timeStepStrategy
-        self.overlappingWidth = overlappingWidth
+        self.timeStepStrategy = time_step_strategy
         self.cuda_block_size = cuda_block_size
         self.warmupSteps = 10
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+
         # bubble parameters
         self.bubbleRadius = min(self.size) // 4
         self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
@@ -41,19 +54,18 @@ class Scenario:
         return {
             'DomainSetup': {
                 'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                'cellsPerBlock': self.cells_per_block,
                 'periodic': self.periodic,
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
                 'remainingTimeLoggerFrequency': -1,
                 'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'gpuBlockSize': self.cuda_block_size,
                 'warmupSteps': self.warmupSteps,
                 'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi
             },
             'Boundaries_GPU': {
                 'Border': []
@@ -86,43 +98,46 @@ class Scenario:
             df.to_csv(self.csv_file, index=False, mode='a', header=False)
 
 
-def overlap_benchmark():
+def benchmark():
     scenarios = wlb.ScenarioManager()
-    overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                         (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                         (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
-
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
-                   (32, 4, 2), (64, 4, 2), (128, 2, 2),
-                   (32, 8, 1), (64, 8, 1), (64, 4, 2),
-                   (32, 16, 1), (16, 16, 1), (16, 16, 2)]
-
-    # no overlap
-    scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
-
-    # overlap
-    for overlap_strategy in ['overlap']:
-        for overlappingWidth in overlappingWidths:
-            for cuda_block in cuda_blocks:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    overlappingWidth=overlappingWidth,
-                                    cuda_block_size=cuda_block)
-                scenarios.add(scenario)
+
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+
+    block_size = (256, 256, 256)
+
+    if not domain_block_size_ok(block_size, gpu_mem):
+        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+    else:
+        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
 
 
 def kernel_benchmark():
     scenarios = wlb.ScenarioManager()
 
-    # overlap
-    # for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
-    for overlap_strategy in ['overlap']:
-        scenario = Scenario(timeStepStrategy=overlap_strategy,
-                            overlappingWidth=(1, 1, 1),
-                            cuda_block_size=(128, 1, 1))
-        scenarios.add(scenario)
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+
+    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
+
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
+                   (32, 4, 1), (64, 4, 1),
+                   (32, 4, 2),
+                   (32, 8, 1),
+                   (16, 16, 1)]
+
+    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
+        for cuda_block in cuda_blocks:
+            for block_size in block_sizes:
+                if not domain_block_size_ok(block_size, gpu_mem):
+                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                    continue
+                scenario = Scenario(time_step_strategy=time_step_strategy,
+                                    cuda_block_size=cuda_block,
+                                    cells_per_block=block_size)
+                scenarios.add(scenario)
 
 
-# overlap_benchmark()
+# benchmark()
 kernel_benchmark()
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py
index e4fc118efaaaf6650ad0c756928f21796003bd2b..58cf970f4e634a5eba9fb528d1c3b407b9e86942 100755
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py
@@ -3,57 +3,55 @@ import waLBerla as wlb
 import pandas as pd
 
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
 
 import sys
 
 
 class Scenario:
-    def __init__(self, timeStepStrategy, overlappingWidth):
+    def __init__(self, time_step_strategy, cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
         # output frequencies
-        self.vtkWriteFrequency = -1
+        self.vtkWriteFrequency = 0
 
         # simulation parameters
-        self.timesteps = 500
-        edge_size = 50
-        self.cells = (edge_size, edge_size, edge_size)
-        self.blocks = (1, 1, 1)
+        self.timesteps = 101
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
         self.periodic = (1, 1, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])
 
-        self.timeStepStrategy = timeStepStrategy
-        self.overlappingWidth = overlappingWidth
-        self.cuda_block_size = (-1, -1, -1)
+        self.timeStepStrategy = time_step_strategy
         self.warmupSteps = 10
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+
         # bubble parameters
         self.bubbleRadius = min(self.size) // 4
         self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
 
-        self.scenario = 1  # 1 rising bubble, 2 RTI
         self.config_dict = self.config()
 
-        self.csv_file = "benchmark_cpu.csv"
+        self.csv_file = "benchmark.csv"
 
     @wlb.member_callback
     def config(self):
         return {
             'DomainSetup': {
                 'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                'cellsPerBlock': self.cells_per_block,
                 'periodic': self.periodic,
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
-                'remainingTimeLoggerFrequency': 10.0,
+                'remainingTimeLoggerFrequency': -1,
                 'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
-                'gpuBlockSize': self.cuda_block_size,
                 'warmupSteps': self.warmupSteps,
-                'scenario': self.scenario,
+                'scenario': 1,
+                'cudaEnabledMpi': self.cudaEnabledMpi
             },
             'Boundaries_GPU': {
                 'Border': []
@@ -86,31 +84,23 @@ class Scenario:
             df.to_csv(self.csv_file, index=False, mode='a', header=False)
 
 
-def overlap_benchmark():
+def benchmark():
     scenarios = wlb.ScenarioManager()
-    overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                         (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                         (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32),
-                         (64, 32, 32), (64, 64, 32), (64, 64, 64)]
-    # no overlap
-    scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1)))
-
-    # overlap
-    for overlap_strategy in ['overlap']:
-        for overlappingWidth in overlappingWidths:
-            scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                overlappingWidth=overlappingWidth)
-            scenarios.add(scenario)
+    block_size = (64, 64, 64)
+
+    scenarios.add(Scenario(time_step_strategy='normal', cells_per_block=block_size))
 
 
 def kernel_benchmark():
     scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
 
-    # overlap
-    scenario = Scenario(timeStepStrategy='overlap',
-                        overlappingWidth=(8, 8, 8))
-    scenarios.add(scenario)
+    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
+        for block_size in block_sizes:
+            scenario = Scenario(time_step_strategy=time_step_strategy,
+                                cells_per_block=block_size)
+            scenarios.add(scenario)
 
 
-# overlap_benchmark()
+# benchmark()
 kernel_benchmark()
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
index f9a20e01a0ddcd11a65eade491cf05a35f25cca4..5af04a35531a3bebe2123018a760aa4a00c72623 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -37,8 +37,6 @@
 
 #include "timeloop/SweepTimeloop.h"
 
-#include <blockforest/communication/UniformBufferedScheme.h>
-
 #include "InitializerFunctions.h"
 
 //////////////////////////////
@@ -48,40 +46,19 @@
 #if defined(WALBERLA_BUILD_WITH_CUDA)
 #   include "cuda/AddGPUFieldToStorage.h"
 #   include "cuda/DeviceSelectMPI.h"
-#   include "cuda/HostFieldAllocator.h"
 #   include "cuda/ParallelStreams.h"
-#   include "cuda/communication/GPUPackInfo.h"
 #   include "cuda/communication/MemcpyPackInfo.h"
 #   include "cuda/communication/UniformGPUScheme.h"
-
-#   include "GenDefines.h"
-#   include "PackInfo_phase_field.h"
-#   include "PackInfo_phase_field_distributions.h"
-#   include "PackInfo_velocity_based_distributions.h"
-#   include "hydro_LB_step.h"
-#   include "initialize_phase_field_distributions.h"
-#   include "initialize_velocity_based_distributions.h"
-#   include "phase_field_LB_step.h"
-
 #else
-#   include "GenDefines.h"
-#   include "PackInfo_phase_field.h"
-#   include "PackInfo_phase_field_distributions.h"
-#   include "PackInfo_velocity_based_distributions.h"
-#   include "hydro_LB_step.h"
-#   include "initialize_phase_field_distributions.h"
-#   include "initialize_velocity_based_distributions.h"
-#   include "phase_field_LB_step.h"
+#   include <blockforest/communication/UniformBufferedScheme.h>
 #endif
 
+#include "GenDefines.h"
+
 using namespace walberla;
 
-using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
-using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
-using VelocityField_T  = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
-using PhaseField_T     = GhostLayerField< real_t, 1 >;
-using flag_t           = walberla::uint8_t;
-using FlagField_T      = FlagField< flag_t >;
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
 
 #if defined(WALBERLA_BUILD_WITH_CUDA)
 typedef cuda::GPUField< real_t > GPUField;
@@ -111,9 +88,7 @@ int main(int argc, char** argv)
       const real_t remainingTimeLoggerFrequency =
          parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
       const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
-      Vector3< int > overlappingWidth =
-         parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
-      const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
+      const uint_t warmupSteps  = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
 
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       // CPU fields
@@ -165,24 +140,21 @@ int main(int argc, char** argv)
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
       pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1],
-         Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+         lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
       pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
-                                              gpuBlockSize[1],
-                                              Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+                                              gpuBlockSize[1], gpuBlockSize[2]);
 #else
       pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
-      pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field, phase_field, vel_field, Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
-      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field,
-                                              Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
+      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
 #endif
 
 // add communication
 #if defined(WALBERLA_BUILD_WITH_CUDA)
+      const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
       auto Comm_velocity_based_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_velocity_based_distributions =
          make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
@@ -190,13 +162,12 @@ int main(int argc, char** argv)
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
 
       auto Comm_phase_field_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field_distributions =
          make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
 #else
 
-
       blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
 
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
@@ -244,29 +215,15 @@ int main(int argc, char** argv)
       auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       auto normalTimeStep = [&]() {
+         Comm_velocity_based_distributions->startCommunication(defaultStream);
          for (auto& block : *blocks)
-         {
-            Comm_phase_field_distributions->communicate(nullptr);
-            phase_field_LB_step(&block);
+            phase_field_LB_step(&block, defaultStream);
+         Comm_velocity_based_distributions->wait(defaultStream);
 
-            Comm_velocity_based_distributions->communicate(nullptr);
-            hydro_LB_step(&block);
-         }
-      };
-      auto simpleOverlapTimeStep = [&]() {
          Comm_phase_field_distributions->startCommunication(defaultStream);
          for (auto& block : *blocks)
-            phase_field_LB_step.inner(&block, defaultStream);
+            hydro_LB_step(&block, defaultStream);
          Comm_phase_field_distributions->wait(defaultStream);
-         for (auto& block : *blocks)
-            phase_field_LB_step.outer(&block, defaultStream);
-
-         Comm_velocity_based_distributions->startCommunication(defaultStream);
-         for (auto& block : *blocks)
-            hydro_LB_step.inner(&block, defaultStream);
-         Comm_velocity_based_distributions->wait(defaultStream);
-         for (auto& block : *blocks)
-            hydro_LB_step.outer(&block, defaultStream);
       };
       auto phase_only = [&]() {
          for (auto& block : *blocks)
@@ -284,29 +241,15 @@ int main(int argc, char** argv)
       };
 #else
       auto normalTimeStep = [&]() {
-         for (auto& block : *blocks)
-         {
-            Comm_phase_field_distributions();
-            phase_field_LB_step(&block);
-
-            Comm_velocity_based_distributions();
-            hydro_LB_step(&block);
-         }
-      };
-      auto simpleOverlapTimeStep = [&]() {
-        Comm_phase_field_distributions.startCommunication();
-        for (auto& block : *blocks)
-           phase_field_LB_step.inner(&block);
-        Comm_phase_field_distributions.wait();
-        for (auto& block : *blocks)
-           phase_field_LB_step.outer(&block);
-
-        Comm_velocity_based_distributions.startCommunication();
-        for (auto& block : *blocks)
-           hydro_LB_step.inner(&block);
-        Comm_velocity_based_distributions.wait();
-        for (auto& block : *blocks)
-           hydro_LB_step.outer(&block);
+            Comm_velocity_based_distributions.startCommunication();
+            for (auto& block : *blocks)
+               phase_field_LB_step(&block);
+            Comm_velocity_based_distributions.wait();
+
+            Comm_phase_field_distributions.startCommunication();
+            for (auto& block : *blocks)
+               hydro_LB_step(&block);
+            Comm_phase_field_distributions.wait();
       };
       auto phase_only = [&]() {
          for (auto& block : *blocks)
@@ -324,12 +267,7 @@ int main(int argc, char** argv)
       };
 #endif
       std::function< void() > timeStep;
-      if (timeStepStrategy == "overlap")
-      {
-         timeStep = std::function< void() >(simpleOverlapTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
-      }
-      else if (timeStepStrategy == "phase_only")
+      if (timeStepStrategy == "phase_only")
       {
          timeStep = std::function< void() >(phase_only);
          WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
@@ -347,7 +285,7 @@ int main(int argc, char** argv)
       else
       {
          timeStep = std::function< void() >(normalTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
+         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
       }
 
       timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
@@ -364,10 +302,8 @@ int main(int argc, char** argv)
          auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                          "simulation_step", false, true, true, false, 0);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
-         vtkOutput->addBeforeFunction([&]() {
-            cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-            // cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
-         });
+         vtkOutput->addBeforeFunction(
+            [&]() { cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); });
 #endif
          auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
          vtkOutput->addCellDataWriter(phaseWriter);
@@ -402,6 +338,11 @@ int main(int argc, char** argv)
          if (pythonCallbackResults.isCallable())
          {
             pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+            pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
+            pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
+            #if defined(WALBERLA_BUILD_WITH_CUDA)
+               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
+            #endif
             // Call Python function to report results
             pythonCallbackResults();
          }
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_piz_daint.py b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_piz_daint.py
deleted file mode 100755
index 816d74b2787e1ca718ce7454be06b8f818011593..0000000000000000000000000000000000000000
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_piz_daint.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import os
-import waLBerla as wlb
-import csv
-
-from waLBerla.tools.sqlitedb import sequenceValuesToScalars
-
-import sys
-
-
-class Scenario:
-    def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
-        # output frequencies
-        self.vtkWriteFrequency = 0
-
-        # simulation parameters
-        self.timesteps = 201
-        edge_size = 100
-        self.cells = (edge_size, edge_size, edge_size)
-        self.blocks = (1, 1, 1)
-        self.periodic = (1, 1, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
-
-        self.timeStepStrategy = timeStepStrategy
-        self.overlappingWidth = overlappingWidth
-        self.cuda_block_size = cuda_block_size
-        self.warmupSteps = 20
-
-        # bubble parameters
-        self.bubbleRadius = min(self.size) // 4
-        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
-
-        self.scenario = 1  # 1 rising bubble, 2 RTI
-        self.config_dict = self.config()
-
-        self.csv_file = "benchmark_piz_daint.csv"
-
-    @wlb.member_callback
-    def config(self):
-        return {
-            'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
-                'periodic': self.periodic,
-            },
-            'Parameters': {
-                'timesteps': self.timesteps,
-                'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
-                'remainingTimeLoggerFrequency': -1,
-                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
-                'gpuBlockSize': self.cuda_block_size,
-                'warmupSteps': self.warmupSteps,
-                'scenario': self.scenario,
-            },
-            'Boundaries_GPU': {
-                'Border': []
-            },
-            'Boundaries_CPU': {
-                'Border': []
-            },
-            'Bubble': {
-                'bubbleMidPoint': self.bubbleMidPoint,
-                'bubbleRadius': self.bubbleRadius,
-            },
-        }
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
-        data = {}
-        data.update(self.config_dict['Parameters'])
-        data.update(self.config_dict['DomainSetup'])
-        data.update(kwargs)
-        data['executable'] = sys.argv[0]
-        data['compile_flags'] = wlb.build_info.compiler_flags
-        data['walberla_version'] = wlb.build_info.version
-        data['build_machine'] = wlb.build_info.build_machine
-        sequenceValuesToScalars(data)
-
-        if not os.path.isfile(self.csv_file):
-            try:
-                with open(self.csv_file, 'w') as csvfile:
-                    writer = csv.DictWriter(csvfile, fieldnames=data)
-                    writer.writeheader()
-                    writer.writerow(data)
-            except IOError:
-                print("could not create csv file")
-        else:
-            try:
-                with open(self.csv_file, 'a') as csvfile:
-                    writer = csv.DictWriter(csvfile, fieldnames=data)
-                    writer.writerow(data)
-            except IOError:
-                print("could not write to csv file")
-
-
-def overlap_benchmark():
-    scenarios = wlb.ScenarioManager()
-    overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                         (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                         (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
-
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
-                   (32, 4, 1), (64, 4, 1),
-                   (32, 4, 2), (32, 8, 1), (16, 16, 1)]
-
-    # no overlap
-    scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
-
-    # overlap
-    for overlap_strategy in ['overlap']:
-        for overlappingWidth in overlappingWidths:
-            for cuda_block in cuda_blocks:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    overlappingWidth=overlappingWidth,
-                                    cuda_block_size=cuda_block)
-                scenarios.add(scenario)
-
-
-def kernel_benchmark():
-    scenarios = wlb.ScenarioManager()
-
-    # overlap
-    # for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
-    for overlap_strategy in ['overlap']:
-        scenario = Scenario(timeStepStrategy=overlap_strategy,
-                            overlappingWidth=(16, 16, 16),
-                            cuda_block_size=(128, 4, 1))
-        scenarios.add(scenario)
-
-
-def weak_scaling():
-    scenarios = wlb.ScenarioManager()
-
-    # overlap
-    # for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
-    for overlap_strategy in ['overlap']:
-        scenario = Scenario(timeStepStrategy=overlap_strategy,
-                            overlappingWidth=(32, 32, 32),
-                            cuda_block_size=(128, 2, 1))
-        scenarios.add(scenario)
-
-
-# overlap_benchmark()
-# kernel_benchmark()
-weak_scaling()
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
index bbd994de58a0b180293b7e643a2a27e0b6c50068..9be9c174b29b7d9452202f70d77d21ad497a8db2 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
@@ -1,11 +1,10 @@
 from pystencils import fields, TypedSymbol
 from pystencils.simp import sympy_cse
-from pystencils import AssignmentCollection
 
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_method
 from lbmpy.stencils import get_stencil
 
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field, generate_info_header
 from lbmpy_walberla import generate_lb_pack_info
 
 from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
@@ -16,133 +15,145 @@ from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
 
 import numpy as np
 
-stencil_phase = get_stencil("D3Q15", "walberla")
-stencil_hydro = get_stencil("D3Q27", "walberla")
-q_phase = len(stencil_phase)
-q_hydro = len(stencil_hydro)
-
-maxwellian_moments = False
-assert (len(stencil_phase[0]) == len(stencil_hydro[0]))
-dimensions = len(stencil_hydro[0])
-
-########################
-# PARAMETER DEFINITION #
-########################
-
-density_liquid = 1.0
-density_gas = 0.001
-surface_tension = 0.0001
-M = 0.02
-
-# phase-field parameter
-drho3 = (density_liquid - density_gas) / 3
-# interface thickness
-W = 5
-# coefficient related to surface tension
-beta = 12.0 * (surface_tension / W)
-# coefficient related to surface tension
-kappa = 1.5 * surface_tension * W
-# relaxation rate allen cahn (h)
-w_c = 1.0 / (0.5 + (3.0 * M))
-
-########################
-# FIELDS #
-########################
-
-# velocity field
-u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
-# phase-field
-C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
-C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
-
-# phase-field distribution functions
-h = fields(f"lb_phase_field({q_phase}): [{dimensions}D]", layout='fzyx')
-h_tmp = fields(f"lb_phase_field_tmp({q_phase}): [{dimensions}D]", layout='fzyx')
-# hydrodynamic distribution functions
-g = fields(f"lb_velocity_field({q_hydro}): [{dimensions}D]", layout='fzyx')
-g_tmp = fields(f"lb_velocity_field_tmp({q_hydro}): [{dimensions}D]", layout='fzyx')
-
-########################################
-# RELAXATION RATES AND EXTERNAL FORCES #
-########################################
-
-# calculate the relaxation rate for the hydro lb as well as the body force
-density = density_gas + C.center * (density_liquid - density_gas)
-
-# force acting on all phases of the model
-body_force = np.array([0, 1e-6, 0])
-
-relaxation_time = 0.03 + 0.5
-relaxation_rate = 1.0 / relaxation_time
-
-###############
-# LBM METHODS #
-###############
-
-method_phase = create_lb_method(stencil=stencil_phase, method='srt', relaxation_rate=w_c, compressible=True)
-
-method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
-                                relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1],
-                                maxwellian_moments=maxwellian_moments)
-
-# create the kernels for the initialization of the g and h field
-h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W)
-g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
-
-
-force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
-force_model_h = MultiphaseForceModel(force=force_h)
-
-force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
-                             fd_stencil=get_stencil("D3Q27"))
-
-force_model_g = MultiphaseForceModel(force=force_g, rho=density)
-
-####################
-# LBM UPDATE RULES #
-####################
-
-phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
-                                                      velocity_input=u,
-                                                      output={'density': C_tmp},
-                                                      force_model=force_model_h,
-                                                      symbolic_fields={"symbolic_field": h,
-                                                                       "symbolic_temporary_field": h_tmp},
-                                                      kernel_type='stream_pull_collide')
-
-phase_field_LB_step = sympy_cse(phase_field_LB_step)
-
-# ---------------------------------------------------------------------------------------------------------
-
-hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
-                                                density=density,
-                                                velocity_input=u,
-                                                force_model=force_model_g,
-                                                sub_iterations=2,
-                                                symbolic_fields={"symbolic_field": g,
-                                                                 "symbolic_temporary_field": g_tmp},
-                                                kernel_type='collide_stream_push')
-
-###################
-# GENERATE SWEEPS #
-###################
-
-cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': True}
-
-vp = [('int32_t', 'cudaBlockSize0'),
-      ('int32_t', 'cudaBlockSize1')]
+with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
 
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
-sweep_params = {'block_size': sweep_block_size}
+    stencil_phase_name = "D3Q15"
+    stencil_hydro_name = "D3Q27"
+
+    stencil_phase = get_stencil(stencil_phase_name, "walberla")
+    stencil_hydro = get_stencil(stencil_hydro_name, "walberla")
+    q_phase = len(stencil_phase)
+    q_hydro = len(stencil_hydro)
+
+    assert (len(stencil_phase[0]) == len(stencil_hydro[0]))
+    dimensions = len(stencil_hydro[0])
+
+    ########################
+    # PARAMETER DEFINITION #
+    ########################
+
+    density_liquid = 1.0
+    density_gas = 0.001
+    surface_tension = 0.0001
+    M = 0.02
+
+    # phase-field parameter
+    drho3 = (density_liquid - density_gas) / 3
+    # interface thickness
+    W = 5
+    # coefficient related to surface tension
+    beta = 12.0 * (surface_tension / W)
+    # coefficient related to surface tension
+    kappa = 1.5 * surface_tension * W
+    # relaxation rate allen cahn (h)
+    w_c = 1.0 / (0.5 + (3.0 * M))
+
+    ########################
+    # FIELDS #
+    ########################
+
+    # velocity field
+    u = fields(f"vel_field({dimensions}): {field_type}[{dimensions}D]", layout='fzyx')
+    # phase-field
+    C = fields(f"phase_field: {field_type}[{dimensions}D]", layout='fzyx')
+    C_tmp = fields(f"phase_field_tmp: {field_type}[{dimensions}D]", layout='fzyx')
+
+    # phase-field distribution functions
+    h = fields(f"lb_phase_field({q_phase}): {field_type}[{dimensions}D]", layout='fzyx')
+    h_tmp = fields(f"lb_phase_field_tmp({q_phase}): {field_type}[{dimensions}D]", layout='fzyx')
+    # hydrodynamic distribution functions
+    g = fields(f"lb_velocity_field({q_hydro}): {field_type}[{dimensions}D]", layout='fzyx')
+    g_tmp = fields(f"lb_velocity_field_tmp({q_hydro}): {field_type}[{dimensions}D]", layout='fzyx')
+
+    ########################################
+    # RELAXATION RATES AND EXTERNAL FORCES #
+    ########################################
+
+    # calculate the relaxation rate for the hydro lb as well as the body force
+    density = density_gas + C.center * (density_liquid - density_gas)
+
+    # force acting on all phases of the model
+    body_force = np.array([0, 1e-6, 0])
+
+    relaxation_time = 0.03 + 0.5
+    relaxation_rate = 1.0 / relaxation_time
+
+    ###############
+    # LBM METHODS #
+    ###############
+
+    method_phase = create_lb_method(stencil=stencil_phase, method='srt', relaxation_rate=w_c, compressible=True)
+
+    method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
+                                    relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1])
+
+    # create the kernels for the initialization of the g and h field
+    h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W)
+    g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
+
+    force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
+    force_model_h = MultiphaseForceModel(force=force_h)
+
+    force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta,
+                                 body_force,
+                                 fd_stencil=get_stencil("D3Q27"))
+
+    force_model_g = MultiphaseForceModel(force=force_g, rho=density)
+
+    ####################
+    # LBM UPDATE RULES #
+    ####################
+
+    phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
+                                                          velocity_input=u,
+                                                          output={'density': C_tmp},
+                                                          force_model=force_model_h,
+                                                          symbolic_fields={"symbolic_field": h,
+                                                                           "symbolic_temporary_field": h_tmp},
+                                                          kernel_type='stream_pull_collide')
+
+    phase_field_LB_step = sympy_cse(phase_field_LB_step)
+
+    # ---------------------------------------------------------------------------------------------------------
+
+    hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
+                                                    density=density,
+                                                    velocity_input=u,
+                                                    force_model=force_model_g,
+                                                    sub_iterations=2,
+                                                    symbolic_fields={"symbolic_field": g,
+                                                                     "symbolic_temporary_field": g_tmp},
+                                                    kernel_type='collide_stream_push')
+
+    hydro_LB_step = sympy_cse(hydro_LB_step)
+    ###################
+    # GENERATE SWEEPS #
+    ###################
+
+    cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': True}
+
+    vp = [('int32_t', 'cudaBlockSize0'),
+          ('int32_t', 'cudaBlockSize1'),
+          ('int32_t', 'cudaBlockSize2')]
+
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        TypedSymbol("cudaBlockSize2", np.int32))
+    sweep_params = {'block_size': sweep_block_size}
+
+    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
+                        'Stencil_hydro_T': stencil_hydro}
+    field_typedefs = {'PdfField_phase_T': h,
+                      'PdfField_hydro_T': g,
+                      'VelocityField_T': u,
+                      'PhaseField_T': C}
 
-info_header = f"""
-#include "stencil/D3Q{q_phase}.h"\nusing Stencil_phase_T = walberla::stencil::D3Q{q_phase};
-#include "stencil/D3Q{q_hydro}.h"\nusing Stencil_hydro_T = walberla::stencil::D3Q{q_hydro};
-"""
+    additional_code = f"""
+    const char * StencilNamePhase = "{stencil_phase_name}";
+    const char * StencilNameHydro = "{stencil_hydro_name}";
+    """
 
-with CodeGeneration() as ctx:
     if not ctx.cuda:
         if not ctx.optimize_for_localhost:
             cpu_vec = {'instruction_set': None}
@@ -169,8 +180,6 @@ with CodeGeneration() as ctx:
 
         generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='cpu')
 
-        ctx.write_file("GenDefines.h", info_header)
-
     if ctx.cuda:
         generate_sweep(ctx, 'initialize_phase_field_distributions',
                        h_updates, target='gpu')
@@ -179,14 +188,12 @@ with CodeGeneration() as ctx:
 
         generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
                        field_swaps=[(h, h_tmp), (C, C_tmp)],
-                       inner_outer_split=True,
                        target='gpu',
                        gpu_indexing_params=sweep_params,
                        varying_parameters=vp)
 
         generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
                        field_swaps=[(g, g_tmp)],
-                       inner_outer_split=True,
                        target='gpu',
                        gpu_indexing_params=sweep_params,
                        varying_parameters=vp)
@@ -199,6 +206,8 @@ with CodeGeneration() as ctx:
 
         generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target='gpu')
 
-        ctx.write_file("GenDefines.h", info_header)
+        # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'GenDefines', stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
+                         additional_code=additional_code)
 
 print("finished code generation successfully")
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/profiling.py b/apps/benchmarks/PhaseFieldAllenCahn/profiling.py
index f8e6bed300cb1ce5e04d1ae46915bf926d90c809..d2e416f93f1bdfb21b030c840dd86e0700197bbf 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/profiling.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/profiling.py
@@ -16,15 +16,13 @@ class Scenario:
                      self.cells[1] * self.blocks[1],
                      self.cells[2] * self.blocks[2])
 
-        self.timeStepStrategy = 'phase_only'
-        self.overlappingWidth = (1, 1, 1)
+        self.timeStepStrategy = 'phase_only'  # 'phase_only', 'hydro_only', 'kernel_only', 'normal'
         self.cuda_block_size = (128, 1, 1)
 
         # bubble parameters
         self.bubbleRadius = min(self.size) // 4
         self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
 
-        self.scenario = 1  # 1 rising bubble, 2 RTI
         self.config_dict = self.config()
 
     @wlb.member_callback
@@ -38,13 +36,11 @@ class Scenario:
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
-                'useGui': 0,
                 'remainingTimeLoggerFrequency': -1,
                 'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'gpuBlockSize': self.cuda_block_size,
                 'warmupSteps': 0,
-                'scenario': self.scenario,
+                'scenario': 1,
             },
             'Boundaries_GPU': {
                 'Border': []
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index b3e7ac03396100609e509149f35ab7179712f945..8b503fedaabc15bd0b8bc465ac6ed7a1f6d1047d 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -11,10 +11,9 @@ import os
 import waLBerla as wlb
 from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
-from functools import reduce
-import operator
 import sys
 import sqlite3
+from math import prod
 
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
@@ -36,10 +35,6 @@ BASE_CONFIG = {
 }
 
 
-def prod(seq):
-    return reduce(operator.mul, seq, 1)
-
-
 def num_time_steps(block_size, time_steps_for_128_block=200):
     cells = block_size[0] * block_size[1] * block_size[2]
     time_steps = (128 ** 3 / cells) * time_steps_for_128_block
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
index 34bdb9fdceb1432638a89d5670c7c8ce537158d5..2f6c36edd7d96359a6c1c7aef9eeebd9bfdcbfea 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
@@ -2,7 +2,7 @@ import waLBerla as wlb
 
 
 class Scenario:
-    def __init__(self):
+    def __init__(self, cuda_enabled_mpi=False):
         # output frequencies
         self.vtkWriteFrequency = 1000
 
@@ -28,6 +28,9 @@ class Scenario:
         self.counter = 0
         self.yPositions = []
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+        self.cuda_blocks = (64, 2, 2)
+
     @wlb.member_callback
     def config(self):
         return {
@@ -44,6 +47,8 @@ class Scenario:
                 'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi,
+                'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
                 'density_liquid': 1.0,
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
index a97ea043fd4facc1010f2838b99e5269c91a6023..f2c8f0087fd54a3de147537613b8282a4b5f23a8 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
@@ -58,16 +58,6 @@
 ////////////////////////////
 
 #include "GenDefines.h"
-#include "PackInfo_phase_field.h"
-#include "PackInfo_phase_field_distributions.h"
-#include "PackInfo_velocity_based_distributions.h"
-#include "hydro_LB_NoSlip.h"
-#include "hydro_LB_step.h"
-#include "initialize_phase_field_distributions.h"
-#include "initialize_velocity_based_distributions.h"
-#include "phase_field_LB_NoSlip.h"
-#include "phase_field_LB_step.h"
-#include "ContactAngle.h"
 
 ////////////
 // USING //
@@ -75,31 +65,11 @@
 
 using namespace walberla;
 
-using NormalsField_T = GhostLayerField< int8_t, Stencil_hydro_T::Dimension >;
-using FlagField_T    = FlagField< uint8_t >;
+using FlagField_T = FlagField< uint8_t >;
 
 typedef cuda::GPUField< real_t > GPUField;
 typedef cuda::GPUField< uint8_t > GPUField_int;
 
-class Filter
-{
- public:
-   explicit Filter(Vector3< uint_t > numberOfCells) : numberOfCells_(numberOfCells) {}
-
-   void operator()(const IBlock& /*block*/) {}
-
-   bool operator()(const cell_idx_t x, const cell_idx_t y, const cell_idx_t z) const
-   {
-      return x >= -1 && x <= cell_idx_t(numberOfCells_[0]) && y >= -1 && y <= cell_idx_t(numberOfCells_[1]) &&
-             z >= -1 && z <= cell_idx_t(numberOfCells_[2]);
-   }
-
- private:
-   Vector3< uint_t > numberOfCells_;
-};
-
-using FluidFilter_T = Filter;
-
 int main(int argc, char** argv)
 {
    mpi::Environment Env(argc, argv);
@@ -120,7 +90,7 @@ int main(int argc, char** argv)
 
       auto domainSetup                = config->getOneBlock("DomainSetup");
       Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
-      const bool tube                 = domainSetup.getParameter< bool >("tube", true);
+      const bool tube                 = domainSetup.getParameter< bool >("tube", false);
 
       ////////////////////////////////////////
       // ADD GENERAL SIMULATION PARAMETERS //
@@ -132,10 +102,9 @@ int main(int argc, char** argv)
       const real_t remainingTimeLoggerFrequency =
          parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
       const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
-      Vector3< int > overlappingWidth =
-         parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
       Vector3< int > gpuBlockSize =
          parameters.getParameter< Vector3< int > >("gpuBlockSize", Vector3< int >(128, 1, 1));
+      const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
 
       /////////////////////////
       // ADD DATA TO BLOCKS //
@@ -178,7 +147,6 @@ int main(int argc, char** argv)
          const real_t bubbleRadius              = bubbleParameters.getParameter< real_t >("bubbleRadius", 20.0);
          const bool bubble                      = bubbleParameters.getParameter< bool >("bubble", true);
          initPhaseField_sphere(blocks, phase_field, bubbleRadius, bubbleMidPoint, bubble, interface_thickness);
-
       }
       else if (scenario == 2)
       {
@@ -213,33 +181,32 @@ int main(int argc, char** argv)
                                                               interface_thickness);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
-      pystencils::phase_field_LB_step phase_field_LB_step(
-         flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, interface_thickness, mobility,
-         gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2],
-         Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+      pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
+                                                          vel_field_gpu, interface_thickness, mobility, gpuBlockSize[0],
+                                                          gpuBlockSize[1], gpuBlockSize[2]);
 
       pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
                                               gravitational_acceleration, interface_thickness, density_liquid,
                                               density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas,
-                                              gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2],
-                                              Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+                                              gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
 
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
 
       auto Comm_velocity_based_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_velocity_based_distributions =
          make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
 
-      auto Comm_phase_field = make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
+      auto Comm_phase_field =
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
       Comm_phase_field->addPackInfo(generatedPackInfo_phase_field);
 
       auto Comm_phase_field_distributions =
-         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 1);
+         make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field_distributions =
          make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
@@ -261,7 +228,8 @@ int main(int argc, char** argv)
             const real_t eccentricity      = domainSetup.getParameter< real_t >("ratio", real_c(0));
             const real_t start_transition  = domainSetup.getParameter< real_t >("start_transition", real_c(60));
             const real_t length_transition = domainSetup.getParameter< real_t >("length_transition", real_c(10));
-            const bool eccentricity_or_pipe_ration = domainSetup.getParameter< bool >("eccentricity_or_pipe_ration", true);
+            const bool eccentricity_or_pipe_ration =
+               domainSetup.getParameter< bool >("eccentricity_or_pipe_ration", true);
             initTubeWithCylinder(blocks, flagFieldID, wallFlagUID, inner_radius, eccentricity, start_transition,
                                  length_transition, eccentricity_or_pipe_ration);
          }
@@ -290,56 +258,30 @@ int main(int argc, char** argv)
 
       auto timeLoop       = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
       auto normalTimeStep = [&]() {
+         Comm_velocity_based_distributions->startCommunication(defaultStream);
          for (auto& block : *blocks)
          {
-            Comm_phase_field_distributions->communicate(nullptr);
-            phase_field_LB_NoSlip(&block);
-
-            phase_field_LB_step(&block);
-            contact_angle(&block);
-            Comm_phase_field->communicate(nullptr);
-
-            hydro_LB_step(&block);
-
-            Comm_velocity_based_distributions->communicate(nullptr);
-            hydro_LB_NoSlip(&block);
+            phase_field_LB_NoSlip(&block, defaultStream);
+            phase_field_LB_step(&block, defaultStream);
          }
-      };
-      auto simpleOverlapTimeStep = [&]() {
+         Comm_velocity_based_distributions->wait(defaultStream);
+
          for (auto& block : *blocks)
-            phase_field_LB_NoSlip(&block);
+         {
+            contact_angle(&block, defaultStream);
+            Comm_phase_field->communicate(defaultStream);
+         }
 
          Comm_phase_field_distributions->startCommunication(defaultStream);
          for (auto& block : *blocks)
-            phase_field_LB_step.inner(&block, defaultStream);
+         {
+            hydro_LB_NoSlip(&block, defaultStream);
+            hydro_LB_step(&block, defaultStream);
+         }
          Comm_phase_field_distributions->wait(defaultStream);
-         for (auto& block : *blocks)
-            phase_field_LB_step.outer(&block, defaultStream);
-
-         for (auto& block : *blocks)
-            contact_angle(&block);
-
-         Comm_phase_field->startCommunication(defaultStream);
-         for (auto& block : *blocks)
-            hydro_LB_step.inner(&block, defaultStream);
-         Comm_phase_field->wait(defaultStream);
-         for (auto& block : *blocks)
-            hydro_LB_step.outer(&block, defaultStream);
-
-         for (auto& block : *blocks)
-            hydro_LB_NoSlip(&block);
       };
       std::function< void() > timeStep;
-      if (timeStepStrategy == "overlap")
-      {
-         timeStep = std::function< void() >(simpleOverlapTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
-      }
-      else
-      {
-         timeStep = std::function< void() >(normalTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
-      }
+      timeStep = std::function< void() >(normalTimeStep);
 
       timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
 
@@ -360,77 +302,87 @@ int main(int argc, char** argv)
          init_h(&block);
          init_g(&block);
       }
+
+      for (auto& block : *blocks)
+      {
+         Comm_phase_field_distributions->communicate(nullptr);
+         phase_field_LB_NoSlip(&block);
+      }
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs done")
-      uint_t dbWriteFrequency = parameters.getParameter< uint_t >("dbWriteFrequency", 10000000);
+      uint_t dbWriteFrequency = parameters.getParameter< uint_t >("dbWriteFrequency", 0);
       int targetRank          = 0;
 
-      timeLoop->addFuncAfterTimeStep(
-         [&]() {
-            if (timeLoop->getCurrentTimeStep() % dbWriteFrequency == 0)
-            {
-               cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-               cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
-               if (scenario == 4)
+      if (dbWriteFrequency > 0)
+      {
+         timeLoop->addFuncAfterTimeStep(
+            [&]() {
+               if (timeLoop->getCurrentTimeStep() % dbWriteFrequency == 0)
                {
-                  std::array< real_t, 4 > total_velocity = { 0.0, 0.0, 0.0, 0.0 };
-                  real_t volume;
-                  uint_t nrCells;
-                  PhaseField_T gatheredPhaseField(0, 0, 0, 0);
-                  VelocityField_T gatheredVelocityField(0, 0, 0, 0);
-
-                  CellInterval boundingBox = blocks->getDomainCellBB();
-                  if (cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5) >= 0)
-                     boundingBox.min()[1] = cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5);
-                  if (cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5) <= boundingBox.max()[1])
-                     boundingBox.max()[1] = cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5);
-
-                  field::gather< PhaseField_T >(gatheredPhaseField, blocks, phase_field, boundingBox, targetRank);
-                  field::gather< VelocityField_T >(gatheredVelocityField, blocks, vel_field, boundingBox, targetRank);
-
-                  WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
-                  {
-                     flood_fill(gatheredPhaseField, gatheredVelocityField, boundingBox, volume, nrCells, center_of_mass,
-                                total_velocity);
-                  }
-                  WALBERLA_MPI_SECTION() { walberla::mpi::broadcastObject(center_of_mass, targetRank); }
-
-                  python_coupling::PythonCallback callback("at_end_of_time_step");
-                  if (callback.isCallable())
+                  cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
+                  cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
+                  if (scenario == 4)
                   {
-                     callback.data().exposeValue("blocks", blocks);
-                     callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
-                     callback.data().exposeValue("target_rank", targetRank);
-                     callback.data().exposeValue("bounding_box_min", boundingBox.min()[1]);
-                     callback.data().exposeValue("bounding_box_max", boundingBox.max()[1]);
-                     callback.data().exposeValue("total_velocity", total_velocity[0]);
-                     callback.data().exposeValue("total_velocity_X", total_velocity[1]);
-                     callback.data().exposeValue("total_velocity_Y", total_velocity[2]);
-                     callback.data().exposeValue("total_velocity_Z", total_velocity[3]);
-                     callback.data().exposeValue("center_of_mass_X", center_of_mass[0]);
-                     callback.data().exposeValue("center_of_mass_Y", center_of_mass[1]);
-                     callback.data().exposeValue("center_of_mass_Z", center_of_mass[2]);
-                     callback.data().exposeValue("sum_inv_phi", volume);
-                     callback.data().exposeValue("gas_cells_of_the_taylor_bubble", nrCells);
-                     callback.data().exposeValue("stencil_phase", stencil_phase_name);
-                     callback.data().exposeValue("stencil_hydro", stencil_hydro_name);
-                     callback();
+                     std::array< real_t, 4 > total_velocity = { 0.0, 0.0, 0.0, 0.0 };
+                     real_t volume;
+                     uint_t nrCells;
+                     PhaseField_T gatheredPhaseField(0, 0, 0, 0);
+                     VelocityField_T gatheredVelocityField(0, 0, 0, 0);
+
+                     CellInterval boundingBox = blocks->getDomainCellBB();
+                     if (cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5) >= 0)
+                        boundingBox.min()[1] = cell_idx_t(center_of_mass[1] - cell_idx_t(cellsPerBlock[0]) * 1.5);
+                     if (cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5) <= boundingBox.max()[1])
+                        boundingBox.max()[1] = cell_idx_t(center_of_mass[1] + cell_idx_t(cellsPerBlock[0]) * 1.5);
+
+                     field::gather< PhaseField_T >(gatheredPhaseField, blocks, phase_field, boundingBox, targetRank);
+                     field::gather< VelocityField_T >(gatheredVelocityField, blocks, vel_field, boundingBox,
+                                                      targetRank);
+
+                     WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
+                     {
+                        flood_fill(gatheredPhaseField, gatheredVelocityField, boundingBox, volume, nrCells,
+                                   center_of_mass, total_velocity);
+                     }
+                     WALBERLA_MPI_SECTION() { walberla::mpi::broadcastObject(center_of_mass, targetRank); }
+
+                     python_coupling::PythonCallback callback("at_end_of_time_step");
+                     if (callback.isCallable())
+                     {
+                        callback.data().exposeValue("blocks", blocks);
+                        callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                        callback.data().exposeValue("target_rank", targetRank);
+                        callback.data().exposeValue("bounding_box_min", boundingBox.min()[1]);
+                        callback.data().exposeValue("bounding_box_max", boundingBox.max()[1]);
+                        callback.data().exposeValue("total_velocity", total_velocity[0]);
+                        callback.data().exposeValue("total_velocity_X", total_velocity[1]);
+                        callback.data().exposeValue("total_velocity_Y", total_velocity[2]);
+                        callback.data().exposeValue("total_velocity_Z", total_velocity[3]);
+                        callback.data().exposeValue("center_of_mass_X", center_of_mass[0]);
+                        callback.data().exposeValue("center_of_mass_Y", center_of_mass[1]);
+                        callback.data().exposeValue("center_of_mass_Z", center_of_mass[2]);
+                        callback.data().exposeValue("sum_inv_phi", volume);
+                        callback.data().exposeValue("gas_cells_of_the_taylor_bubble", nrCells);
+                        callback.data().exposeValue("stencil_phase", StencilNamePhase);
+                        callback.data().exposeValue("stencil_hydro", StencilNameHydro);
+                        callback();
+                     }
                   }
-               }
-               else
-               {
-                  python_coupling::PythonCallback callback("at_end_of_time_step");
-                  if (callback.isCallable())
+                  else
                   {
-                     callback.data().exposeValue("blocks", blocks);
-                     callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
-                     callback.data().exposeValue("stencil_phase", stencil_phase_name);
-                     callback.data().exposeValue("stencil_hydro", stencil_hydro_name);
-                     callback();
+                     python_coupling::PythonCallback callback("at_end_of_time_step");
+                     if (callback.isCallable())
+                     {
+                        callback.data().exposeValue("blocks", blocks);
+                        callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                        callback.data().exposeValue("stencil_phase", StencilNamePhase);
+                        callback.data().exposeValue("stencil_hydro", StencilNameHydro);
+                        callback();
+                     }
                   }
                }
-            }
-         },
-         "Python callback");
+            },
+            "Python callback");
+      }
 
       int meshWriteFrequency = parameters.getParameter< int >("meshWriteFrequency", 0);
       int counter            = 0;
@@ -469,7 +421,7 @@ int main(int argc, char** argv)
                                                          "simulation_step", false, true, true, false, 0);
          vtkOutput->addBeforeFunction([&]() {
             cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
-            cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
+            cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
          });
          auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T, float > >(phase_field, "PhaseField");
          vtkOutput->addCellDataWriter(phaseWriter);
@@ -480,12 +432,6 @@ int main(int argc, char** argv)
          auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float > >(vel_field, "Velocity");
          vtkOutput->addCellDataWriter(velWriter);
 
-         FluidFilter_T filter(cellsPerBlock);
-
-         auto QCriterionWriter = make_shared< lbm::QCriterionVTKWriter< VelocityField_T, FluidFilter_T, float > >(
-            blocks, filter, vel_field, "Q-Criterion");
-         vtkOutput->addCellDataWriter(QCriterionWriter);
-
          timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
index 024b66a9e15c4291f045d500d4493de2448f137b..f4d6f9db166032fe1f57d60fa4ff13ec5f722f19 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
@@ -10,7 +10,7 @@ from lbmpy.phasefield_allen_cahn.parameter_calculation import calculate_paramete
 
 
 class Scenario:
-    def __init__(self):
+    def __init__(self, cuda_enabled_mpi=False):
         # output frequencies
         self.vtkWriteFrequency = 1000
 
@@ -51,13 +51,14 @@ class Scenario:
         self.tube = False
 
         # everything else
-        self.dbFile = "RTI.csv"
-
         self.scenario = 2  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
 
         self.counter = 0
         self.yPositions = []
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+        self.cuda_blocks = (64, 2, 2)
+
         self.config_dict = self.config()
 
     @wlb.member_callback
@@ -75,6 +76,8 @@ class Scenario:
                 'dbWriteFrequency': self.dbWriteFrequency,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi,
+                'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
                 'density_liquid': self.density_heavy,
@@ -147,7 +150,8 @@ class Scenario:
 
                 sequenceValuesToScalars(data)
 
-                csv_file = f"RTI_{data['stencil_phase']}_{data['stencil_hydro']}_Re_{self.reynolds_number}_tube.csv"
+                csv_file = f"RTI_{data['stencil_phase']}_{data['stencil_hydro']}_Re_{self.reynolds_number}"
+                csv_file += "_tube.csv" if self.tube else ".csv"
 
                 df = pd.DataFrame.from_records([data])
                 if not os.path.isfile(csv_file):
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
index 0abf47375eaa5d43924d6582761307278ecb52ea..ae42819a4b97579616086e14ba0d70391ef1839a 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_codegen.py
@@ -9,11 +9,11 @@ from lbmpy.creationfunctions import create_lb_method
 from lbmpy.stencils import get_stencil
 
 import pystencils_walberla
-from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field, generate_info_header
 from lbmpy_walberla import generate_boundary, generate_lb_pack_info
 
-from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb,\
-    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, get_collision_assignments_hydro,\
+from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
+    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, get_collision_assignments_hydro, \
     get_collision_assignments_phase
 
 from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
@@ -21,180 +21,174 @@ from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
 import numpy as np
 import sympy as sp
 
-stencil_phase_name = "D3Q27"
-stencil_hydro_name = "D3Q27"
-
-contact_angle_in_degrees = 22
-
-stencil_phase = get_stencil(stencil_phase_name)
-stencil_hydro = get_stencil(stencil_hydro_name)
-q_phase = len(stencil_phase)
-q_hydro = len(stencil_hydro)
-
-assert (len(stencil_phase[0]) == len(stencil_hydro[0]))
-dimensions = len(stencil_hydro[0])
-
-########################
-# PARAMETER DEFINITION #
-########################
-
-density_liquid = sp.Symbol("rho_H")
-density_gas = sp.Symbol("rho_L")
-
-surface_tension = sp.Symbol("sigma")
-mobility = sp.Symbol("mobility")
-
-gravitational_acceleration = sp.Symbol("gravity")
-
-relaxation_time_liquid = sp.Symbol("tau_H")
-relaxation_time_gas = sp.Symbol("tau_L")
-
-# phase-field parameter
-drho3 = (density_liquid - density_gas) / 3
-# interface thickness
-W = sp.Symbol("interface_thickness")
-# coefficients related to surface tension
-beta = 12.0 * (surface_tension / W)
-kappa = 1.5 * surface_tension * W
-
-########################
-# FIELDS #
-########################
-
-# velocity field
-u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
-# phase-field
-C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
-C_tmp = fields(f"phase_field_tmp: [{dimensions}D]", layout='fzyx')
-
-flag = fields(f"flag_field: uint8[{dimensions}D]", layout='fzyx')
-# phase-field distribution functions
-h = fields(f"lb_phase_field({q_phase}): [{dimensions}D]", layout='fzyx')
-h_tmp = fields(f"lb_phase_field_tmp({q_phase}): [{dimensions}D]", layout='fzyx')
-# hydrodynamic distribution functions
-g = fields(f"lb_velocity_field({q_hydro}): [{dimensions}D]", layout='fzyx')
-g_tmp = fields(f"lb_velocity_field_tmp({q_hydro}): [{dimensions}D]", layout='fzyx')
-
-########################################
-# RELAXATION RATES AND EXTERNAL FORCES #
-########################################
-
-# relaxation rate for interface tracking LB step
-relaxation_rate_allen_cahn = 1.0 / (0.5 + (3.0 * mobility))
-# calculate the relaxation rate for hydrodynamic LB step
-density = density_gas + C.center * (density_liquid - density_gas)
-# force acting on all phases of the model
-body_force = np.array([0, gravitational_acceleration * density, 0])
-# calculation of the relaxation time via viscosities
-# viscosity = viscosity_gas * viscosity_liquid + C.center\
-#             * (density_liquid*viscosity_liquid - viscosity_liquid*viscosity_gas)
-# relaxation_time = 3 * viscosity / density + 0.5
-
-relaxation_time = 0.5 + relaxation_time_gas + C.center * (relaxation_time_liquid - relaxation_time_gas)
-# calculate the relaxation time if the phase-field has values over one
-# relaxation_rate = 1.0 / relaxation_time
-relaxation_rate = sp.Symbol("s8")
-relaxation_rate_cutoff = sp.Piecewise((1 / (0.5 + relaxation_time_liquid), C.center > 0.999),   # True value
-                                      (1 / relaxation_time, True))                              # Else value
-
-###############
-# LBM METHODS #
-###############
-
-# method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
-#                                 relaxation_rates=[1, 1.5, 1, 1.5, 1, 1.5])
-method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
-                                relaxation_rates=[1, 1, 1, 1, 1, 1])
-
-method_phase.set_conserved_moments_relaxation_rate(relaxation_rate_allen_cahn)
-
-method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
-                                relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1])
-
-
-# create the kernels for the initialization of the g and h field
-h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W, fd_stencil=get_stencil("D3Q27"))
-g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
-
-force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
-force_model_h = MultiphaseForceModel(force=force_h)
-
-force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force,
-                             fd_stencil=get_stencil("D3Q27"))
-
-force_model_g = MultiphaseForceModel(force=force_g, rho=density)
-
-####################
-# LBM UPDATE RULES #
-####################
-
-phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
-                                                      velocity_input=u,
-                                                      output={'density': C_tmp},
-                                                      force_model=force_model_h,
-                                                      symbolic_fields={"symbolic_field": h,
-                                                                       "symbolic_temporary_field": h_tmp},
-                                                      kernel_type='stream_pull_collide')
-
-phase_field_LB_step = sympy_cse(phase_field_LB_step)
-
-phase_field_LB_step = [Conditional(sp.Eq(flag.center(), 2),
-                                   Block(phase_field_LB_step),
-                                   Block([Assignment(C_tmp.center, C.center)]))]
-# ---------------------------------------------------------------------------------------------------------
-hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
-                                                density=density,
-                                                velocity_input=u,
-                                                force_model=force_model_g,
-                                                sub_iterations=2,
-                                                symbolic_fields={"symbolic_field": g,
-                                                                 "symbolic_temporary_field": g_tmp},
-                                                kernel_type='collide_stream_push')
-
-hydro_LB_step.set_sub_expressions_from_dict({**{relaxation_rate: relaxation_rate_cutoff},
-                                             **hydro_LB_step.subexpressions_dict})
-
-hydro_LB_step = [Conditional(sp.Eq(flag.center(), 2), Block(hydro_LB_step))]
-
-contact_angle = ContactAngle(contact_angle_in_degrees, W)
-
-
-###################
-# GENERATE SWEEPS #
-###################
-
-vp = [('int32_t', 'cudaBlockSize0'),
-      ('int32_t', 'cudaBlockSize1'),
-      ('int32_t', 'cudaBlockSize2')]
-
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    TypedSymbol("cudaBlockSize2", np.int32))
-
-sweep_params = {'block_size': sweep_block_size}
-
-info_header = f"""
-using namespace walberla;
-#include "stencil/D3Q{q_phase}.h"\nusing Stencil_phase_T = walberla::stencil::D3Q{q_phase};
-#include "stencil/D3Q{q_hydro}.h"\nusing Stencil_hydro_T = walberla::stencil::D3Q{q_hydro};
-using PdfField_phase_T = GhostLayerField<real_t, {q_phase}>;
-using PdfField_hydro_T = GhostLayerField<real_t, {q_hydro}>;
-using VelocityField_T = GhostLayerField<real_t, {dimensions}>;
-using PhaseField_T = GhostLayerField<real_t, 1>;
-#ifndef UTIL_H
-#define UTIL_H
-const char * stencil_phase_name = "{stencil_phase_name}";
-const char * stencil_hydro_name = "{stencil_hydro_name}";
-#endif
-"""
-
 with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
+
+    stencil_phase_name = "D3Q27"
+    stencil_hydro_name = "D3Q27"
+
+    contact_angle_in_degrees = 22
+
+    stencil_phase = get_stencil(stencil_phase_name)
+    stencil_hydro = get_stencil(stencil_hydro_name)
+    q_phase = len(stencil_phase)
+    q_hydro = len(stencil_hydro)
+
+    assert (len(stencil_phase[0]) == len(stencil_hydro[0]))
+    dimensions = len(stencil_hydro[0])
+
+    ########################
+    # PARAMETER DEFINITION #
+    ########################
+
+    density_liquid = sp.Symbol("rho_H")
+    density_gas = sp.Symbol("rho_L")
+
+    surface_tension = sp.Symbol("sigma")
+    mobility = sp.Symbol("mobility")
+
+    gravitational_acceleration = sp.Symbol("gravity")
+
+    relaxation_time_liquid = sp.Symbol("tau_H")
+    relaxation_time_gas = sp.Symbol("tau_L")
+
+    # phase-field parameter
+    drho3 = (density_liquid - density_gas) / 3
+    # interface thickness
+    W = sp.Symbol("interface_thickness")
+    # coefficients related to surface tension
+    beta = 12.0 * (surface_tension / W)
+    kappa = 1.5 * surface_tension * W
+
+    ########################
+    # FIELDS #
+    ########################
+
+    # velocity field
+    u = fields(f"vel_field({dimensions}): {field_type}[{dimensions}D]", layout='fzyx')
+    # phase-field
+    C = fields(f"phase_field: {field_type}[{dimensions}D]", layout='fzyx')
+    C_tmp = fields(f"phase_field_tmp: {field_type}[{dimensions}D]", layout='fzyx')
+
+    flag = fields(f"flag_field: uint8[{dimensions}D]", layout='fzyx')
+    # phase-field distribution functions
+    h = fields(f"lb_phase_field({q_phase}): {field_type}[{dimensions}D]", layout='fzyx')
+    h_tmp = fields(f"lb_phase_field_tmp({q_phase}): {field_type}[{dimensions}D]", layout='fzyx')
+    # hydrodynamic distribution functions
+    g = fields(f"lb_velocity_field({q_hydro}): {field_type}[{dimensions}D]", layout='fzyx')
+    g_tmp = fields(f"lb_velocity_field_tmp({q_hydro}): {field_type}[{dimensions}D]", layout='fzyx')
+
+    ########################################
+    # RELAXATION RATES AND EXTERNAL FORCES #
+    ########################################
+
+    # relaxation rate for interface tracking LB step
+    relaxation_rate_allen_cahn = 1.0 / (0.5 + (3.0 * mobility))
+    # calculate the relaxation rate for hydrodynamic LB step
+    density = density_gas + C.center * (density_liquid - density_gas)
+    # force acting on all phases of the model
+    body_force = np.array([0, gravitational_acceleration * density, 0])
+    # calculation of the relaxation time via viscosities
+    # viscosity = viscosity_gas * viscosity_liquid + C.center\
+    #             * (density_liquid*viscosity_liquid - viscosity_liquid*viscosity_gas)
+    # relaxation_time = 3 * viscosity / density + 0.5
+
+    relaxation_time = 0.5 + relaxation_time_gas + C.center * (relaxation_time_liquid - relaxation_time_gas)
+    # calculate the relaxation time if the phase-field has values over one
+    # relaxation_rate = 1.0 / relaxation_time
+    relaxation_rate = sp.Symbol("s8")
+    relaxation_rate_cutoff = sp.Piecewise((1 / (0.5 + relaxation_time_liquid), C.center > 0.999),  # True value
+                                          (1 / relaxation_time, True))  # Else value
+
+    ###############
+    # LBM METHODS #
+    ###############
+    method_phase = create_lb_method(stencil=stencil_phase, method="mrt", compressible=True, weighted=True,
+                                    relaxation_rates=[1, 1, 1, 1, 1, 1])
+
+    method_phase.set_first_moment_relaxation_rate(relaxation_rate_allen_cahn)
+
+    method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
+                                    relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1])
+
+    # create the kernels for the initialization of the g and h field
+    h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W, fd_stencil=get_stencil("D3Q27"))
+    g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
+
+    force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W, fd_stencil=get_stencil("D3Q27"))]
+    force_model_h = MultiphaseForceModel(force=force_h)
+
+    force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta,
+                                 body_force,
+                                 fd_stencil=get_stencil("D3Q27"))
+
+    force_model_g = MultiphaseForceModel(force=force_g, rho=density)
+
+    ####################
+    # LBM UPDATE RULES #
+    ####################
+
+    phase_field_LB_step = get_collision_assignments_phase(lb_method=method_phase,
+                                                          velocity_input=u,
+                                                          output={'density': C_tmp},
+                                                          force_model=force_model_h,
+                                                          symbolic_fields={"symbolic_field": h,
+                                                                           "symbolic_temporary_field": h_tmp},
+                                                          kernel_type='stream_pull_collide')
+
+    phase_field_LB_step = sympy_cse(phase_field_LB_step)
+
+    phase_field_LB_step = [Conditional(sp.Eq(flag.center(), 2),
+                                       Block(phase_field_LB_step),
+                                       Block([Assignment(C_tmp.center, C.center)]))]
+    # ---------------------------------------------------------------------------------------------------------
+    hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
+                                                    density=density,
+                                                    velocity_input=u,
+                                                    force_model=force_model_g,
+                                                    sub_iterations=2,
+                                                    symbolic_fields={"symbolic_field": g,
+                                                                     "symbolic_temporary_field": g_tmp},
+                                                    kernel_type='collide_stream_push')
+
+    hydro_LB_step.set_sub_expressions_from_dict({**{relaxation_rate: relaxation_rate_cutoff},
+                                                 **hydro_LB_step.subexpressions_dict})
+
+    hydro_LB_step = [Conditional(sp.Eq(flag.center(), 2), Block(hydro_LB_step))]
+
+    contact_angle = ContactAngle(contact_angle_in_degrees, W)
+
+    ###################
+    # GENERATE SWEEPS #
+    ###################
+
+    vp = [('int32_t', 'cudaBlockSize0'),
+          ('int32_t', 'cudaBlockSize1'),
+          ('int32_t', 'cudaBlockSize2')]
+
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        TypedSymbol("cudaBlockSize2", np.int32))
+
+    sweep_params = {'block_size': sweep_block_size}
+
+    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
+                        'Stencil_hydro_T': stencil_hydro}
+    field_typedefs = {'PdfField_phase_T': h,
+                      'PdfField_hydro_T': g,
+                      'VelocityField_T': u,
+                      'PhaseField_T': C}
+
+    additional_code = f"""
+    const char * StencilNamePhase = "{stencil_phase_name}";
+    const char * StencilNameHydro = "{stencil_hydro_name}";
+    """
+
     generate_sweep(ctx, 'initialize_phase_field_distributions', h_updates, target='gpu')
     generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target='gpu')
 
     generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
                    field_swaps=[(h, h_tmp), (C, C_tmp)],
-                   inner_outer_split=True,
                    target='gpu',
                    gpu_indexing_params=sweep_params,
                    varying_parameters=vp)
@@ -202,7 +196,6 @@ with CodeGeneration() as ctx:
 
     generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
                    field_swaps=[(g, g_tmp)],
-                   inner_outer_split=True,
                    target='gpu',
                    gpu_indexing_params=sweep_params,
                    varying_parameters=vp)
@@ -221,6 +214,7 @@ with CodeGeneration() as ctx:
     pystencils_walberla.boundary.generate_boundary(ctx, 'ContactAngle', contact_angle,
                                                    C.name, stencil_hydro, index_shape=[], target='gpu')
 
-    ctx.write_file("GenDefines.h", info_header)
+    generate_info_header(ctx, 'GenDefines', stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
+                         additional_code=additional_code)
 
 print("finished code generation successfully")
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
index 186590774aa43e8624eaf891f94616aad081360f..54f63d35f9ca27d6095d07751a8bb36bd910c3a1 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
@@ -7,7 +7,7 @@ from lbmpy.phasefield_allen_cahn.parameter_calculation import calculate_dimensio
 
 
 class Scenario:
-    def __init__(self):
+    def __init__(self, cuda_enabled_mpi=False):
         # output frequencies
         self.vtkWriteFrequency = 1000
         self.dbWriteFrequency = 200
@@ -49,6 +49,9 @@ class Scenario:
         self.counter = 0
         self.yPositions = []
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+        self.cuda_blocks = (64, 2, 2)
+
     @wlb.member_callback
     def config(self):
         return {
@@ -65,6 +68,8 @@ class Scenario:
                 'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi,
+                'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
                 'density_liquid': self.density_heavy,
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py
deleted file mode 100644
index 55db7aa693400b12f71d097470c57bed2609ee8e..0000000000000000000000000000000000000000
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/parameters_taylor_bubble.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import math
-
-
-def calculate_parameters_taylor_bubble(reference_length=128,
-                                       reference_time=16000,
-                                       density_heavy=1.0,
-                                       d1=0.0254,
-                                       d2=0.0127):
-    r"""
-    Calculate the simulation parameters for a rising Taylor bubble. The calculation can be found in
-    10.1016/S0009-2509(97)00210-8 by G. Das
-
-    Args:
-        reference_length: chosen reference length
-        reference_time: chosen reference time
-        density_heavy: density of the heavier fluid
-        d1: diameter of the outer tube
-        d2: diameter of the inner cylinder
-    """
-
-    water_rho = 998  # kg/m3
-    air_rho = 1.2047  # kg/m3
-    surface_tension = 0.07286  # kg/s2
-    water_mu = 1.002e-3  # kg/ms
-
-    water_nu = water_mu / water_rho  # m2/s
-    air_mu = 1.8205e-5  # kg/ms
-    air_nu = air_mu / air_rho  # m2/s
-    gravity = 9.81  # m/s2
-
-    dh = d1 - d2
-    dr = d1 / d2
-    de = d1 + d2
-    # ur = 0.1695  # (0.28913, 0.23882, 0.1695)
-
-    inverse_viscosity_number = math.sqrt((water_rho - air_rho) * water_rho * gravity * dh ** 3) / water_mu
-    bond_number = (water_rho - air_rho) * gravity * dh ** 2 / surface_tension
-    morton_number = gravity * water_mu ** 4 * (water_rho - air_rho) / (water_rho ** 2 * surface_tension ** 3)
-
-    d = reference_length / dr
-
-    density_light = 1.0 / (water_rho / air_rho)
-    dh = reference_length - d
-    g = dh / reference_time ** 2
-
-    mu_h = math.sqrt((density_heavy - density_light) * density_heavy * g * dh ** 3) / inverse_viscosity_number
-    mu_l = mu_h / (water_mu / air_mu)
-
-    dynamic_viscosity_heavy = mu_h / density_heavy
-    dynamic_viscosity_light = mu_l / density_light
-
-    relaxation_time_heavy = 3 * dynamic_viscosity_heavy
-    relaxation_time_light = 3 * dynamic_viscosity_light
-
-    sigma = (density_heavy - density_light) * g * dh ** 2 / bond_number
-
-    parameters = {
-        "inverse_viscosity_number": inverse_viscosity_number,
-        "bond_number": bond_number,
-        "morton_number": morton_number,
-        "density_light": density_light,
-        "dynamic_viscosity_heavy": dynamic_viscosity_heavy,
-        "dynamic_viscosity_light": dynamic_viscosity_light,
-        "relaxation_time_heavy": relaxation_time_heavy,
-        "relaxation_time_light": relaxation_time_light,
-        "gravitational_acceleration": -g,
-        "surface_tension": sigma
-    }
-    return parameters
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py
index 0afb25234ac119bff33d6876e23bfec7cd79391b..9bac32e1b46874049188db142d8503aeb10848a1 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/taylor_bubble.py
@@ -11,7 +11,7 @@ from waLBerla.tools.sqlitedb import sequenceValuesToScalars
 from scipy.ndimage.filters import gaussian_filter
 import scipy.spatial as spatial
 
-from parameters_taylor_bubble import calculate_parameters_taylor_bubble
+from lbmpy.phasefield_allen_cahn.parameter_calculation import calculate_parameters_taylor_bubble
 
 
 def intersection(points1, points2, eps):
@@ -55,16 +55,17 @@ def get_circle(midpoint, radius):
 
 
 class Scenario:
-    def __init__(self):
+    def __init__(self, cuda_enabled_mpi=False):
         self.density_liquid = 1.0
         self.reference_time = 16000
         self.reference_length = 128
         d1 = 0.0254  # (0.0508, 0.0381, 0.0254)
         d2 = 0.0127  # (0.0254, 0.0127, 0.0127)
-        gpus = 1
-        self.interface_width = 5
+        self.interface_width = 4
         self.mobility = 0.05
 
+        num_processes = wlb.mpi.numProcesses()
+
         # output frequencies
         self.vtkWriteFrequency = self.reference_time
         self.dbWriteFrequency = self.reference_time // 25
@@ -74,8 +75,8 @@ class Scenario:
         # simulation parameters
         self.diameter = self.reference_length
         self.timesteps = self.reference_time * 15 + 1
-        self.cells = (self.diameter, (self.reference_length * 15) // gpus, self.diameter)
-        self.blocks = (1, gpus, 1)
+        self.cells = (self.diameter, (self.reference_length * 15) // num_processes, self.diameter)
+        self.blocks = (1, num_processes, 1)
         self.periodic = (0, 0, 0)
         self.size = (self.cells[0] * self.blocks[0],
                      self.cells[1] * self.blocks[1],
@@ -86,16 +87,13 @@ class Scenario:
         self.center_y = self.size[1] / 2
         self.center_z = self.size[2] / 2
 
-        self.overlappingWidth = (8, 1, 1)
-        self.timeStepStrategy = 'normal'
-
         self.scenario = 4  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
 
         self.counter = 0
         self.yPositions = []
 
         self.eccentricity_or_pipe_ratio = False  # if True eccentricity is conducted otherwise pipe ratio
-        self.ratio = 0.5
+        self.ratio = 0
 
         self.start_transition = (self.size[1] // 2) - 2 * self.diameter
         self.length_transition = 4 * self.diameter
@@ -117,8 +115,8 @@ class Scenario:
         parameters = calculate_parameters_taylor_bubble(reference_length=self.reference_length,
                                                         reference_time=self.reference_time,
                                                         density_heavy=self.density_liquid,
-                                                        d1=d1,
-                                                        d2=d2)
+                                                        diameter_outer=d1,
+                                                        diameter_inner=d2)
 
         self.density_gas = parameters["density_light"]
         self.surface_tension = parameters["surface_tension"]
@@ -128,6 +126,9 @@ class Scenario:
         self.relaxation_time_liquid = parameters.get("relaxation_time_heavy")
         self.relaxation_time_gas = parameters.get("relaxation_time_light")
 
+        self.cudaEnabledMpi = cuda_enabled_mpi
+        self.cuda_blocks = (64, 2, 2)
+
         self.config_dict = self.config()
 
     @wlb.member_callback
@@ -151,6 +152,8 @@ class Scenario:
                 'meshWriteFrequency': self.meshWriteFrequency,
                 'remainingTimeLoggerFrequency': 60.0,
                 'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi,
+                'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
                 'density_liquid': self.density_liquid,
@@ -275,7 +278,7 @@ class Scenario:
                 else:
                     theta = 360
 
-                if t % self.pngWriteFrequency == 0:
+                if self.pngWriteFrequency > 0 and t % self.pngWriteFrequency == 0:
                     plt.plot(center_x + np.linspace(0, center_x) * np.cos(np.radians(angle)),
                              center_z + np.linspace(0, center_z) * np.sin(np.radians(angle)), 'b-')
                     plt.plot(center_x + np.linspace(0, center_x) * np.cos(np.radians(angle1)),
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/util.h b/apps/showcases/PhaseFieldAllenCahn/GPU/util.h
index cefd57d38736d9f8370a86c78e07215fa947e851..0d5689c6945247ac722c32ee61d7f19c00ce6496 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/util.h
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/util.h
@@ -17,6 +17,7 @@
 //! \author Markus Holzer <markus.holzer@fau.de>
 //
 //======================================================================================================================
+#pragma once
 
 #include "python_coupling/DictWrapper.h"
 
@@ -28,7 +29,7 @@
 #include "field/FlagField.h"
 #include "field/vtk/VTKWriter.h"
 #include "GenDefines.h"
-#pragma once
+
 
 namespace walberla {
 
diff --git a/python/pystencils_walberla/templates/Boundary.tmpl.h b/python/pystencils_walberla/templates/Boundary.tmpl.h
index 33c355a530e9313f52d9c623a5e71da71d0a9c7c..cc5351d519cc46b618d100dba1460aeaaf8ea434 100644
--- a/python/pystencils_walberla/templates/Boundary.tmpl.h
+++ b/python/pystencils_walberla/templates/Boundary.tmpl.h
@@ -251,7 +251,7 @@ public:
             {
             {%- for dirIdx, dirVec, offset in additional_data_handler.stencil_info %}
                 dx = {{dirVec[0]}}; dy = {{dirVec[1]}}; {%if dim == 3%} dz = {{dirVec[2]}}; {% endif %}
-                dot = dx*sum_x + dy*sum_y {%if dim == 3%} + dz*sum_z {% endif %};
+                dot = real_c( dx*sum_x + dy*sum_y {%if dim == 3%} + dz*sum_z {% endif %});
                 if (dot > maxn)
                 {
                     maxn = dot;
diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py
index cda1ddbec3a6f98a9ea75d96037866c5018875b8..3283e84c044ca24d28b1c6aac690d19787e23b5c 100644
--- a/python/pystencils_walberla/utility.py
+++ b/python/pystencils_walberla/utility.py
@@ -47,7 +47,9 @@ def generate_info_header(ctx: CodeGenerationContext,
     typedefs = {**stencil_typedefs, **field_typedefs, **additional_typedefs}
     typedefs = [f"using {alias} = {typename};" for alias, typename in typedefs.items()]
 
-    lines = '\n'.join(headers_to_include + [''] + typedefs) + '\n'
+    lines = "#pragma once\n"
+
+    lines += '\n'.join(headers_to_include + [''] + typedefs) + '\n'
 
     if path.splitext(filename)[1] not in HEADER_EXTENSIONS:
         filename += '.h'
diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h
index ba702c5740be3847f8e057af68d002f6fc62e21e..dcd93210ae137e8c9afadcbd6d735e460bbf6447 100644
--- a/src/cuda/communication/UniformGPUScheme.h
+++ b/src/cuda/communication/UniformGPUScheme.h
@@ -50,11 +50,11 @@ namespace communication {
 
        void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-       void startCommunication( cudaStream_t stream = 0);
-       void wait( cudaStream_t stream = 0);
+       void startCommunication( cudaStream_t stream = nullptr);
+       void wait( cudaStream_t stream = nullptr);
 
-      void operator()( cudaStream_t stream = 0 )         { communicate( stream ); }
-      inline void communicate( cudaStream_t stream = 0 ) { startCommunication(stream); wait(stream); }
+      void operator()( cudaStream_t stream = nullptr )         { communicate( stream ); }
+      inline void communicate( cudaStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }
 
    private:
        void setupCommunication();
diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h
index c2bdb32056762ee9b68bdd68f7916ee793c11966..fa8e0c2aa7289e67280761d94ee8d98b79f3cf0b 100644
--- a/src/cuda/communication/UniformGPUScheme.impl.h
+++ b/src/cuda/communication/UniformGPUScheme.impl.h
@@ -43,7 +43,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
    template<typename Stencil>
    void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream )
    {
-      WALBERLA_ASSERT( !communicationInProgress_ );
+      WALBERLA_ASSERT( !communicationInProgress_ )
       auto forest = blockForest_.lock();
 
       auto currentBlockForestStamp = forest->getBlockForest().getModificationStamp();
@@ -79,14 +79,14 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                   parallelSection.run([&](auto s) {
                      auto size = pi->size( *dir, block );
                      auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
-                     WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+                     WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
                      pi->pack( *dir, gpuDataPtr, block, s );
 
                      if( !sendFromGPU_ )
                      {
                         auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
-                        WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
-                        WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s ));
+                        WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
+                        WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s ))
                      }
                   });
                }
@@ -109,7 +109,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
    template<typename Stencil>
    void UniformGPUScheme<Stencil>::wait( cudaStream_t stream )
    {
-      WALBERLA_ASSERT( communicationInProgress_ );
+      WALBERLA_ASSERT( communicationInProgress_ )
 
       auto forest = blockForest_.lock();
 
@@ -127,7 +127,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                {
                   auto size = pi->size( header.dir, block );
                   auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
-                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
                   parallelSection.run([&](auto s) {
                      pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
                   });
@@ -152,12 +152,12 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                   auto size = pi->size( header.dir, block );
                   auto cpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                   auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
-                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
-                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
+                  WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
 
                   parallelSection.run([&](auto s) {
                      WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                           cudaMemcpyHostToDevice, s ));
+                                                           cudaMemcpyHostToDevice, s ))
                      pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
                   });
                }
@@ -190,9 +190,9 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
                continue;
 
             WALBERLA_ASSERT( block->neighborhoodSectionHasEquallySizedBlock( neighborIdx ),
-                             "Works for uniform setups only" );
+                             "Works for uniform setups only" )
             WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize( neighborIdx ), uint_t( 1 ),
-                                   "Works for uniform setups only" );
+                                   "Works for uniform setups only" )
 
             const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 ));
             auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
@@ -235,7 +235,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf
    template<typename Stencil>
    void UniformGPUScheme<Stencil>::addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi )
    {
-      WALBERLA_ASSERT( !communicationInProgress_, "Cannot add pack info while communication is in progress" );
+      WALBERLA_ASSERT( !communicationInProgress_, "Cannot add pack info while communication is in progress" )
       packInfos_.push_back( pi );
       setupBeforeNextCommunication_ = true;
    }