diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py b/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
index ee297075c1030f87b1d4b3cab4ee505f07dc381b..e9a954de8ce8993f0bf7ba9d015006ec7a7017c5 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/droplet_contact_angle.py
@@ -8,40 +8,38 @@ class Scenario:
 
         # simulation parameters
         self.timesteps = 10001
-        self.cells = (32, 64, 32)
-        self.blocks = (4, 1, 4)
-        self.periodic = (0, 0, 0)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
 
-        self.overlappingWidth = (8, 1, 1)
-        self.timeStepStrategy = 'normal'
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (32, 64, 32)
+        # self.blocks = (4, 1, 4)
+        self.cells = (128, 64, 128)
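+        # example: with the (128, 64, 128) domain above, a run on 16 MPI processes could yield
+        # 16 blocks of (32, 64, 32) cells, matching the manual decomposition commented out above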
+
+        self.periodic = (0, 0, 0)
 
         # bubble parameters
         self.dropletRadius = 24.0
         self.dropletMidPoint = (64, 24, 64)
 
         # everything else
-        self.scenario = 1  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
-
-        self.counter = 0
-        self.yPositions = []
+        self.scenario = 1  # 1 rising bubble or droplet, 2 RTI
 
     @wlb.member_callback
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
                 'tube': False
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
-                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
             },
@@ -53,7 +51,7 @@ class Scenario:
                 'gravitational_acceleration': 0.0,
                 'relaxation_time_liquid': 3 * 0.166,
                 'relaxation_time_gas': 3 * 0.0166,
-                'interface_thickness': 5
+                'interface_thickness': 4
             },
             'Boundaries': {
                 'Border': [
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
index 184672199ba99fa86d747cf7b76c134a461b26ef..8e0f2cb621979514e391c4c23258445717125aaf 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase.cpp
@@ -31,24 +31,25 @@
 
 #include "geometry/InitBoundaryHandling.h"
 
+#include "lbm/PerformanceEvaluation.h"
+
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 
 #include "timeloop/SweepTimeloop.h"
 
+#include "GenDefines.h"
 #include "InitializerFunctions.h"
 #include "PythonExports.h"
 
-#include "GenDefines.h"
-
 ////////////
 // USING //
 //////////
 
 using namespace walberla;
 
-using flag_t           = walberla::uint8_t;
-using FlagField_T      = FlagField< flag_t >;
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
 
 int main(int argc, char** argv)
 {
@@ -68,7 +69,6 @@ int main(int argc, char** argv)
       //////////////////////////
 
       auto domainSetup                = config->getOneBlock("DomainSetup");
-      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
       const bool tube                 = domainSetup.getParameter< bool >("tube", true);
 
       ////////////////////////////////////////
@@ -80,21 +80,19 @@ int main(int argc, char** argv)
       const uint_t timesteps             = parameters.getParameter< uint_t >("timesteps", uint_c(50));
       const real_t remainingTimeLoggerFrequency =
          parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
-      const uint_t scenario  = parameters.getParameter< uint_t >("scenario", uint_c(1));
-
-      Vector3< int > overlappingWidth =
-         parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
+      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
 
       /////////////////////////
       // ADD DATA TO BLOCKS //
       ///////////////////////
 
-      BlockDataID lb_phase_field =
-         field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
-      BlockDataID lb_velocity_field =
-         field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
-      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
-      BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
+      BlockDataID allen_cahn_PDFs_ID =
+         field::addToStorage< PdfField_phase_T >(blocks, "LB phase field", real_c(0.0), field::fzyx);
+      BlockDataID hydrodynamic_PDFs_ID =
+         field::addToStorage< PdfField_hydro_T >(blocks, "LB velocity field", real_c(0.0), field::fzyx);
+      BlockDataID velocity_field_ID =
+         field::addToStorage< VelocityField_T >(blocks, "velocity", real_c(0.0), field::fzyx);
+      BlockDataID phase_field_ID = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
 
       BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
 
@@ -116,43 +114,60 @@ int main(int argc, char** argv)
          const Vector3< real_t > bubbleMidPoint = bubbleParameters.getParameter< Vector3< real_t > >("bubbleMidPoint");
          const real_t bubbleRadius              = bubbleParameters.getParameter< real_t >("bubbleRadius", real_c(20.0));
          const bool bubble                      = bubbleParameters.getParameter< bool >("bubble", true);
-         initPhaseField_sphere(blocks, phase_field, bubbleRadius, bubbleMidPoint, bubble);
-      }
-      else if (scenario == 2)
-      {
-         initPhaseField_RTI(blocks, phase_field, interface_thickness, tube);
+         initPhaseField_sphere(blocks, phase_field_ID, bubbleRadius, bubbleMidPoint, bubble);
       }
+      else if (scenario == 2) { initPhaseField_RTI(blocks, phase_field_ID, interface_thickness, tube); }
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the phase-field done")
 
       /////////////////
       // ADD SWEEPS //
       ///////////////
 
-      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field, interface_thickness);
-      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
+      pystencils::initialize_phase_field_distributions init_h(allen_cahn_PDFs_ID, phase_field_ID, velocity_field_ID,
+                                                              interface_thickness);
+      pystencils::initialize_velocity_based_distributions init_g(hydrodynamic_PDFs_ID, velocity_field_ID);
 
-      pystencils::phase_field_LB_step phase_field_LB_step(
-         lb_phase_field, phase_field, vel_field, interface_thickness, mobility,
-         Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
-      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field, gravitational_acceleration,
-                                              interface_thickness, density_liquid, density_gas, surface_tension, relaxation_time_liquid,
-                                              relaxation_time_gas,
-                                              Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
+      pystencils::phase_field_LB_step phase_field_LB_step(allen_cahn_PDFs_ID, phase_field_ID, velocity_field_ID,
+                                                          mobility, interface_thickness);
+      pystencils::hydro_LB_step hydro_LB_step(
+         hydrodynamic_PDFs_ID, phase_field_ID, velocity_field_ID, gravitational_acceleration, interface_thickness,
+         density_liquid, density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas);
 
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field(blocks);
-      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
-      Comm_phase_field.addPackInfo(generatedPackInfo_phase_field);
-
-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
-      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
-      Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
-
-      blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
-      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
-      Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
+      auto UniformBufferedSchemeVelocityDistributions =
+         make_shared< blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > >(blocks);
+      auto generatedPackInfo_velocity_based_distributions =
+         make_shared< lbm::PackInfo_velocity_based_distributions >(hydrodynamic_PDFs_ID);
+      UniformBufferedSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      auto Comm_velocity_based_distributions =
+         std::function< void() >([&]() { UniformBufferedSchemeVelocityDistributions->communicate(); });
+      auto Comm_velocity_based_distributions_start =
+         std::function< void() >([&]() { UniformBufferedSchemeVelocityDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait =
+         std::function< void() >([&]() { UniformBufferedSchemeVelocityDistributions->wait(); });
+
+      auto UniformBufferedSchemePhaseField =
+         make_shared< blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > >(blocks);
+      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_ID);
+      UniformBufferedSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+      auto Comm_phase_field = std::function< void() >([&]() { UniformBufferedSchemePhaseField->communicate(); });
+      auto Comm_phase_field_start =
+         std::function< void() >([&]() { UniformBufferedSchemePhaseField->startCommunication(); });
+      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformBufferedSchemePhaseField->wait(); });
+
+      auto UniformBufferedSchemePhaseFieldDistributions =
+         make_shared< blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > >(blocks);
+      auto generatedPackInfo_phase_field_distributions =
+         make_shared< lbm::PackInfo_phase_field_distributions >(allen_cahn_PDFs_ID);
+      UniformBufferedSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      auto Comm_phase_field_distributions =
+         std::function< void() >([&]() { UniformBufferedSchemePhaseFieldDistributions->communicate(); });
+      auto Comm_phase_field_distributions_start =
+         std::function< void() >([&]() { UniformBufferedSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait =
+         std::function< void() >([&]() { UniformBufferedSchemePhaseFieldDistributions->wait(); });
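+
+      // The communicate()/startCommunication()/wait() wrappers above are plain std::function
+      // objects so that the timeloop can attach them as BeforeFunction/AfterFunction of
+      // individual sweeps, hiding the PDF exchange behind the kernels of the enclosing sweep.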
 
       ////////////////////////
       // BOUNDARY HANDLING //
@@ -168,9 +183,9 @@ int main(int argc, char** argv)
          geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
       }
 
-      lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, lb_phase_field);
-      lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, lb_velocity_field);
-      pystencils::ContactAngle contact_angle(blocks, phase_field, interface_thickness);
+      lbm::phase_field_LB_NoSlip phase_field_LB_NoSlip(blocks, allen_cahn_PDFs_ID);
+      lbm::hydro_LB_NoSlip hydro_LB_NoSlip(blocks, hydrodynamic_PDFs_ID);
+      pystencils::ContactAngle contact_angle(blocks, phase_field_ID, interface_thickness);
 
       phase_field_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
       hydro_LB_NoSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
@@ -182,62 +197,19 @@ int main(int argc, char** argv)
       // TIME LOOP //
       //////////////
 
-      auto timeLoop       = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
-      auto normalTimeStep = [&]() {
-         for (auto& block : *blocks)
-         {
-            Comm_phase_field_distributions();
-            phase_field_LB_NoSlip(&block);
-
-
-            phase_field_LB_step(&block);
-            contact_angle(&block);
-            Comm_phase_field();
-
+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
 
-            hydro_LB_step(&block);
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_NoSlip, "NoSlip Phase");
+      timeloop.add() << Sweep(phase_field_LB_step, "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
+      timeloop.add() << Sweep(contact_angle, "Contact Angle")
+                     << AfterFunction(Comm_phase_field, "Communication Phase Field");
 
-            Comm_velocity_based_distributions();
-            hydro_LB_NoSlip(&block);
-         }
-      };
-      auto simpleOverlapTimeStep = [&]() {
-        for (auto& block : *blocks)
-           phase_field_LB_NoSlip(&block);
-
-        Comm_phase_field_distributions.startCommunication();
-        for (auto& block : *blocks)
-           phase_field_LB_step.inner(&block);
-        Comm_phase_field_distributions.wait();
-        for (auto& block : *blocks)
-           phase_field_LB_step.outer(&block);
-
-        for (auto& block : *blocks)
-           contact_angle(&block);
-
-        Comm_phase_field.startCommunication();
-        for (auto& block : *blocks)
-           hydro_LB_step.inner(&block);
-        Comm_phase_field.wait();
-        for (auto& block : *blocks)
-           hydro_LB_step.outer(&block);
-
-        for (auto& block : *blocks)
-           hydro_LB_NoSlip(&block);
-      };
-      std::function< void() > timeStep;
-      if (timeStepStrategy == "overlap")
-      {
-         timeStep = std::function< void() >(simpleOverlapTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
-      }
-      else
-      {
-         timeStep = std::function< void() >(normalTimeStep);
-         WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
-      }
-
-      timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_NoSlip, "NoSlip Hydro");
+      timeloop.add() << Sweep(hydro_LB_step, "Hydro LB Step")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
 
       if (scenario == 4)
       {
@@ -255,18 +227,19 @@ int main(int argc, char** argv)
          init_h(&block);
          init_g(&block);
       }
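+      // exchange ghost layers once after the initialisation kernels, so the first time step
+      // starts from consistent phase-field PDFs on all block boundaries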
+      Comm_phase_field_distributions();
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs done")
       uint_t dbWriteFrequency = parameters.getParameter< uint_t >("dbWriteFrequency", 10000000);
 
-      timeLoop->addFuncAfterTimeStep(
+      timeloop.addFuncAfterTimeStep(
          [&]() {
-            if (timeLoop->getCurrentTimeStep() % dbWriteFrequency == 0)
+            if (timeloop.getCurrentTimeStep() % dbWriteFrequency == 0)
             {
                python_coupling::PythonCallback callback("at_end_of_time_step");
                if (callback.isCallable())
                {
                   callback.data().exposeValue("blocks", blocks);
-                  callback.data().exposeValue( "timeStep", timeLoop->getCurrentTimeStep());
+                  callback.data().exposeValue("timeStep", timeloop.getCurrentTimeStep());
                   callback.data().exposeValue("stencil_phase", StencilNamePhase);
                   callback.data().exposeValue("stencil_hydro", StencilNameHydro);
                   callback();
@@ -276,35 +249,41 @@ int main(int argc, char** argv)
          "Python callback");
 
       // remaining time logger
-      timeLoop->addFuncAfterTimeStep(
-         timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
          "remaining time logger");
 
       uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
       if (vtkWriteFrequency > 0)
       {
          auto vtkOutput   = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                         "simulation_step", false, true, true, false, 0);
-         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "PhaseField");
+                                                           "simulation_step", false, true, true, false, 0);
+         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field_ID, "PhaseField");
          vtkOutput->addCellDataWriter(phaseWriter);
 
-         // auto velWriter = make_shared<field::VTKWriter<VelocityField_T>>(vel_field, "Velocity");
-         // vtkOutput->addCellDataWriter(velWriter);
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velocity_field_ID, "Velocity");
+         vtkOutput->addCellDataWriter(velWriter);
 
-         timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
 
-      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
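+      // PerformanceEvaluation computes MLUPS from the number of fluid-flagged cells, which stays
+      // correct for runtime-chosen decompositions (unlike the removed per-block cell estimate)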
+      lbm::PerformanceEvaluation< FlagField_T > performance(blocks, flagFieldID, fluidFlagUID);
+      WcTimingPool timeloopTiming;
       WcTimer simTimer;
+
+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WALBERLA_MPI_WORLD_BARRIER()
       simTimer.start();
-      timeLoop->run();
+      timeloop.run(timeloopTiming);
+      WALBERLA_MPI_WORLD_BARRIER()
       simTimer.end();
-      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
-      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
+
+      auto time = real_c(simTimer.max());
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
    }
    return EXIT_SUCCESS;
 }
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
index 401ce693ac893aba8375c64dd2b56f6853a9c08c..d7e2f1959125ce33c3a49faa7f15be94417a9977 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_RTI_3D.py
@@ -17,12 +17,20 @@ class Scenario:
         # simulation parameters
         self.time = 2  # physical time in seconds
 
-        self.cells = (128, 64, 128)
-        self.blocks = (1, 8, 1)
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (128, 64, 128)
+        # self.blocks = (1, 8, 1)
+        # self.size = (self.cell_per_block[0] * self.blocks[0],
+        #              self.cell_per_block[1] * self.blocks[1],
+        #              self.cell_per_block[2] * self.blocks[2])
+
+        self.cells = (128, 512, 128)
+        self.size = self.cells
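+        # example: on 8 MPI processes this (128, 512, 128) domain could be split into 8 blocks of
+        # (128, 64, 128) cells stacked along y, as in the commented-out manual layout above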
         self.periodic = (1, 0, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
 
         # physical parameters
         self.density_heavy = 1.0
@@ -53,19 +61,16 @@ class Scenario:
         # everything else
         self.dbFile = "RTI.csv"
 
-        self.scenario = 2  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
-
-        self.counter = 0
-        self.yPositions = []
-
+        self.scenario = 2  # 1 rising bubble or droplet, 2 RTI
         self.config_dict = self.config()
 
     @wlb.member_callback
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
                 'tube': self.tube
             },
@@ -77,13 +82,13 @@ class Scenario:
                 'scenario': self.scenario,
             },
             'PhysicalParameters': {
-                'density_liquid': self.density_heavy,
-                'density_gas': self.parameters.get("density_light"),
-                'surface_tension': self.parameters.get("surface_tension"),
-                'mobility': self.parameters.get("mobility"),
-                'gravitational_acceleration': self.parameters.get("gravitational_acceleration"),
-                'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
-                'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'density_liquid': self.parameters.density_heavy,
+                'density_gas': self.parameters.density_light,
+                'surface_tension': self.parameters.surface_tension,
+                'mobility': self.parameters.mobility,
+                'gravitational_acceleration': self.parameters.gravitational_acceleration,
+                'relaxation_time_liquid': self.parameters.relaxation_time_heavy,
+                'relaxation_time_gas': self.parameters.relaxation_time_light,
                 'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
index a2dba9cb578e74c130f2fca3fdb25a642fb196aa..1b5113454b3dfbafa2db8b7c37fefd0d78983d73 100644
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_codegen.py
@@ -87,7 +87,7 @@ with CodeGeneration() as ctx:
 
     # create the kernels for the initialization of the g and h field
     h_updates = initializer_kernel_phase_field_lb(method_phase, C, u, h, parameters)
-    g_updates = initializer_kernel_hydro_lb(method_hydro, 1, u, g)
+    g_updates = initializer_kernel_hydro_lb(method_hydro, 1.0, u, g)
 
     force_h = interface_tracking_force(C, stencil_phase, parameters)
     hydro_force = hydrodynamic_force(g, C, method_hydro, parameters, body_force)
@@ -116,17 +116,6 @@ with CodeGeneration() as ctx:
     ###################
     # GENERATE SWEEPS #
     ###################
-
-    vp = [('int32_t', 'cudaBlockSize0'),
-          ('int32_t', 'cudaBlockSize1'),
-          ('int32_t', 'cudaBlockSize2')]
-
-    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                        TypedSymbol("cudaBlockSize1", np.int32),
-                        TypedSymbol("cudaBlockSize2", np.int32))
-
-    sweep_params = {'block_size': sweep_block_size}
-
     stencil_typedefs = {'Stencil_phase_T': stencil_phase,
                         'Stencil_hydro_T': stencil_hydro}
     field_typedefs = {'PdfField_phase_T': h,
@@ -144,13 +133,11 @@ with CodeGeneration() as ctx:
 
     generate_sweep(ctx, 'phase_field_LB_step', allen_cahn_update_rule,
                    field_swaps=[(h, h_tmp), (C, C_tmp)],
-                   inner_outer_split=True,
                    target=Target.CPU)
     generate_boundary(ctx, 'phase_field_LB_NoSlip', NoSlip(), method_phase, target=Target.CPU, streaming_pattern='pull')
 
     generate_sweep(ctx, 'hydro_LB_step', hydro_lb_update_rule,
                    field_swaps=[(g, g_tmp)],
-                   inner_outer_split=True,
                    target=Target.CPU)
     generate_boundary(ctx, 'hydro_LB_NoSlip', NoSlip(), method_hydro, target=Target.CPU, streaming_pattern='push')
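+
+    # note: inner_outer_split was removed from both sweeps above because communication hiding is
+    # now realised in the C++ application via the timeloop's BeforeFunction/AfterFunction hooks
+    # rather than via split inner/outer kernels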
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
index c040fc2f1df72793085b745787bd9dff6d00b9a5..5caf4b34353ca8bcbb57ce60dab13994a7d39c46 100755
--- a/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/CPU/multiphase_rising_bubble.py
@@ -14,15 +14,21 @@ class Scenario:
 
         # simulation parameters
         self.timesteps = 10000
-        self.cells = (32, 32, 64)
-        self.blocks = (2, 4, 1)
-        self.periodic = (0, 0, 0)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
 
-        self.overlappingWidth = (8, 1, 1)
-        self.timeStepStrategy = 'normal'
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (32, 32, 64)
+        # self.blocks = (2, 4, 1)
+        # self.size = (self.cell_per_block[0] * self.blocks[0],
+        #              self.cell_per_block[1] * self.blocks[1],
+        #              self.cell_per_block[2] * self.blocks[2])
+
+        self.cells = (64, 128, 64)
+        self.size = self.cells
+        self.periodic = (0, 0, 0)
 
         # physical parameters
         self.bubbleRadius = 16
@@ -53,27 +59,26 @@ class Scenario:
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
                 'dbWriteFrequency': self.dbWriteFrequency,
-                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
             },
             'PhysicalParameters': {
-                'density_liquid': self.density_heavy,
-                'density_gas': self.parameters["density_light"],
-                'surface_tension': self.parameters["surface_tension"],
-                'mobility': self.parameters.get("mobility", 0.1),
-                'gravitational_acceleration': self.parameters["gravitational_acceleration"],
-                'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
-                'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'density_liquid': self.parameters.density_heavy,
+                'density_gas': self.parameters.density_light,
+                'surface_tension': self.parameters.surface_tension,
+                'mobility': self.parameters.mobility,
+                'gravitational_acceleration': self.parameters.gravitational_acceleration,
+                'relaxation_time_liquid': self.parameters.relaxation_time_heavy,
+                'relaxation_time_gas': self.parameters.relaxation_time_light,
                 'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
index 2f6c36edd7d96359a6c1c7aef9eeebd9bfdcbfea..5678df4c85d9946c5a49d06ef89ad46d1b95741b 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/droplet_contact_angle.py
@@ -7,16 +7,18 @@ class Scenario:
         self.vtkWriteFrequency = 1000
 
         # simulation parameters
-        self.timesteps = 10001
+        self.timesteps = 10000
+
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (32, 64, 32)
+        # self.blocks = (4, 1, 4)
         self.cells = (128, 64, 128)
-        self.blocks = (1, 1, 1)
-        self.periodic = (0, 0, 0)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
 
-        self.overlappingWidth = (8, 1, 1)
-        self.timeStepStrategy = 'normal'
+        self.periodic = (0, 0, 0)
 
         # bubble parameters
         self.dropletRadius = 24.0
@@ -25,26 +27,22 @@ class Scenario:
         # everything else
         self.scenario = 1  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
 
-        self.counter = 0
-        self.yPositions = []
-
         self.cudaEnabledMpi = cuda_enabled_mpi
-        self.cuda_blocks = (64, 2, 2)
+        self.cuda_blocks = (128, 1, 1)
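+        # cuda_blocks sets the CUDA thread-block size for the generated kernels; a flat block like
+        # (128, 1, 1) typically favours coalesced access along the innermost x-coordinate of fzyx fields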
 
     @wlb.member_callback
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
                 'tube': False
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
-                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
                 'cudaEnabledMpi': self.cudaEnabledMpi,
@@ -79,4 +77,4 @@ class Scenario:
 
 
 scenarios = wlb.ScenarioManager()
-scenarios.add(Scenario())
+scenarios.add(Scenario(cuda_enabled_mpi=False))
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
index 852bd26e5b1ef067be6c2723231562371475e938..8d5f3c49869c289c0e93130d5dfc07b6e9158f0f 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
@@ -39,7 +39,7 @@
 #include "geometry/InitBoundaryHandling.h"
 #include "geometry/mesh/TriangleMeshIO.h"
 
-#include "lbm/vtk/QCriterion.h"
+#include "lbm/PerformanceEvaluation.h"
 
 #include "postprocessing/FieldToSurfaceMesh.h"
 
@@ -89,7 +89,7 @@ int main(int argc, char** argv)
       //////////////////////////
 
       auto domainSetup                = config->getOneBlock("DomainSetup");
-      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+      Vector3< uint_t > cellsPerBlock =
+         domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock", Vector3< uint_t >(128, 1, 1));
       const bool tube                 = domainSetup.getParameter< bool >("tube", false);
 
       ////////////////////////////////////////
@@ -182,7 +182,7 @@ int main(int argc, char** argv)
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
       pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
-                                                          vel_field_gpu, interface_thickness, mobility, gpuBlockSize[0],
+                                                          vel_field_gpu, mobility, interface_thickness, gpuBlockSize[0],
                                                           gpuBlockSize[1], gpuBlockSize[2]);
 
       pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
@@ -193,23 +193,43 @@ int main(int argc, char** argv)
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
+      int streamLowPriority  = 0;
+      int streamHighPriority = 0;
+      auto defaultStream     = cuda::StreamRAII::newPriorityStream(streamLowPriority);
+      auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority);
 
-      auto Comm_velocity_based_distributions =
+      auto UniformGPUSchemeVelocityDistributions =
          make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_velocity_based_distributions =
          make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
-      Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      auto Comm_velocity_based_distributions =
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(defaultStream); });
+      auto Comm_velocity_based_distributions_start =
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(defaultStream); });
+      auto Comm_velocity_based_distributions_wait =
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); });
 
-      auto Comm_phase_field =
+      auto UniformGPUSchemePhaseField =
          make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
-      Comm_phase_field->addPackInfo(generatedPackInfo_phase_field);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); });
+      auto Comm_phase_field_start =
+         std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(defaultStream); });
+      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); });
 
-      auto Comm_phase_field_distributions =
+      auto UniformGPUSchemePhaseFieldDistributions =
          make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field_distributions =
          make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
-      Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      auto Comm_phase_field_distributions =
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(defaultStream); });
+      auto Comm_phase_field_distributions_start =
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(defaultStream); });
+      auto Comm_phase_field_distributions_wait =
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(defaultStream); });
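+
+      // as in the CPU application, these start/wait wrappers let the timeloop overlap the GPU
+      // communication (issued on defaultStream) with the sweeps they are attached to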
 
       ////////////////////////
       // BOUNDARY HANDLING //
@@ -250,40 +270,19 @@ int main(int argc, char** argv)
       ////////////////
       // TIME LOOP //
       //////////////
+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
 
-      int streamLowPriority  = 0;
-      int streamHighPriority = 0;
-      auto defaultStream     = cuda::StreamRAII::newPriorityStream(streamLowPriority);
-      auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority);
-
-      auto timeLoop       = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
-      auto normalTimeStep = [&]() {
-         Comm_velocity_based_distributions->startCommunication(defaultStream);
-         for (auto& block : *blocks)
-         {
-            phase_field_LB_NoSlip(&block, defaultStream);
-            phase_field_LB_step(&block, defaultStream);
-         }
-         Comm_velocity_based_distributions->wait(defaultStream);
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_NoSlip, "NoSlip Phase");
+      timeloop.add() << Sweep(phase_field_LB_step, "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
+      timeloop.add() << Sweep(contact_angle, "Contact Angle")
+                     << AfterFunction(Comm_phase_field, "Communication Phase Field");
 
-         for (auto& block : *blocks)
-         {
-            contact_angle(&block, defaultStream);
-            Comm_phase_field->communicate(defaultStream);
-         }
-
-         Comm_phase_field_distributions->startCommunication(defaultStream);
-         for (auto& block : *blocks)
-         {
-            hydro_LB_NoSlip(&block, defaultStream);
-            hydro_LB_step(&block, defaultStream);
-         }
-         Comm_phase_field_distributions->wait(defaultStream);
-      };
-      std::function< void() > timeStep;
-      timeStep = std::function< void() >(normalTimeStep);
-
-      timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_NoSlip, "NoSlip Hydro");
+      timeloop.add() << Sweep(hydro_LB_step, "Hydro LB Step")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
 
       if (scenario == 4)
       {
@@ -295,6 +294,7 @@ int main(int argc, char** argv)
          }
       }
       cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field);
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
 
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs")
       for (auto& block : *blocks)
@@ -302,24 +302,22 @@ int main(int argc, char** argv)
          init_h(&block);
          init_g(&block);
       }
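+      // a single collective ghost-layer exchange after initialisation; the previous version
+      // issued the communication once per local block, which was redundant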
+      Comm_phase_field_distributions();
 
-      for (auto& block : *blocks)
-      {
-         Comm_phase_field_distributions->communicate(nullptr);
-         phase_field_LB_NoSlip(&block);
-      }
       WALBERLA_LOG_INFO_ON_ROOT("Initialisation of the PDFs done")
       uint_t dbWriteFrequency = parameters.getParameter< uint_t >("dbWriteFrequency", 0);
       int targetRank          = 0;
 
       if (dbWriteFrequency > 0)
       {
-         timeLoop->addFuncAfterTimeStep(
+         timeloop.addFuncAfterTimeStep(
             [&]() {
-               if (timeLoop->getCurrentTimeStep() % dbWriteFrequency == 0)
+               if (timeloop.getCurrentTimeStep() % dbWriteFrequency == 0)
                {
                   cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
                   cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
+                  WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
                   if (scenario == 4)
                   {
                      std::array< real_t, 4 > total_velocity = { 0.0, 0.0, 0.0, 0.0 };
@@ -349,7 +347,7 @@ int main(int argc, char** argv)
                      if (callback.isCallable())
                      {
                         callback.data().exposeValue("blocks", blocks);
-                        callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                        callback.data().exposeValue("timeStep", timeloop.getCurrentTimeStep());
                         callback.data().exposeValue("target_rank", targetRank);
                         callback.data().exposeValue("bounding_box_min", boundingBox.min()[1]);
                         callback.data().exposeValue("bounding_box_max", boundingBox.max()[1]);
@@ -373,7 +371,7 @@ int main(int argc, char** argv)
                      if (callback.isCallable())
                      {
                         callback.data().exposeValue("blocks", blocks);
-                        callback.data().exposeValue("timeStep", timeLoop->getCurrentTimeStep());
+                        callback.data().exposeValue("timeStep", timeloop.getCurrentTimeStep());
                         callback.data().exposeValue("stencil_phase", StencilNamePhase);
                         callback.data().exposeValue("stencil_hydro", StencilNameHydro);
                         callback();
@@ -388,9 +386,9 @@ int main(int argc, char** argv)
       int counter            = 0;
       if (meshWriteFrequency > 0)
       {
-         timeLoop->addFuncAfterTimeStep(
+         timeloop.addFuncAfterTimeStep(
             [&]() {
-               if (timeLoop->getCurrentTimeStep() % uint_t(meshWriteFrequency) == 0)
+               if (timeloop.getCurrentTimeStep() % uint_t(meshWriteFrequency) == 0)
                {
                   auto mesh = postprocessing::realFieldToSurfaceMesh< PhaseField_T >(blocks, phase_field, 0.5, 0, true,
                                                                                      targetRank, MPI_COMM_WORLD);
@@ -409,8 +407,8 @@ int main(int argc, char** argv)
       }
 
       // remaining time logger
-      timeLoop->addFuncAfterTimeStep(
-         timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
+      timeloop.addFuncAfterTimeStep(
+         timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency),
          "remaining time logger");
 
       uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
@@ -423,6 +421,8 @@ int main(int argc, char** argv)
             cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
             cuda::fieldCpy< VelocityField_T, GPUField >(blocks, vel_field, vel_field_gpu);
          });
+         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
          auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T, float > >(phase_field, "PhaseField");
          vtkOutput->addCellDataWriter(phaseWriter);
 
@@ -432,23 +432,31 @@ int main(int argc, char** argv)
          auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float > >(vel_field, "Velocity");
          vtkOutput->addCellDataWriter(velWriter);
 
-         timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
 
-      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      lbm::PerformanceEvaluation< FlagField_T > performance(blocks, flagFieldID, fluidFlagUID);
+      WcTimingPool timeloopTiming;
       WcTimer simTimer;
+
+      WALBERLA_MPI_WORLD_BARRIER()
+      cudaDeviceSynchronize();
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
       simTimer.start();
-      timeLoop->run();
+      timeloop.run(timeloopTiming);
 
       cudaDeviceSynchronize();
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
 
       simTimer.end();
-      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
-      auto time            = real_c(simTimer.last());
-      auto nrOfCells       = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
-      auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
-      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
+      auto time = real_c(simTimer.max());
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
    }
    return EXIT_SUCCESS;
 }
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
index f4d6f9db166032fe1f57d60fa4ff13ec5f722f19..312a705d7213a8e0f1385793f01debb8f7ff1b78 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_RTI_3D.py
@@ -17,12 +17,20 @@ class Scenario:
         # simulation parameters
         self.time = 2  # physical time in seconds
 
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (128, 64, 128)
+        # self.blocks = (1, 8, 1)
+        # self.size = (self.cell_per_block[0] * self.blocks[0],
+        #              self.cell_per_block[1] * self.blocks[1],
+        #              self.cell_per_block[2] * self.blocks[2])
+
         self.cells = (128, 512, 128)
-        self.blocks = (1, 1, 1)
+        self.size = self.cells
         self.periodic = (1, 0, 1)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
 
         # physical parameters
         self.density_heavy = 1.0
@@ -53,9 +61,6 @@ class Scenario:
         # everything else
         self.scenario = 2  # 1 rising bubble or droplet, 2 RTI, 3 bubble field, 4 taylor bubble set up
 
-        self.counter = 0
-        self.yPositions = []
-
         self.cudaEnabledMpi = cuda_enabled_mpi
         self.cuda_blocks = (64, 2, 2)
 
@@ -65,8 +70,9 @@ class Scenario:
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
                 'tube': self.tube
             },
@@ -80,13 +86,13 @@ class Scenario:
                 'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
-                'density_liquid': self.density_heavy,
-                'density_gas': self.parameters.get("density_light"),
-                'surface_tension': self.parameters.get("surface_tension"),
-                'mobility': self.parameters.get("mobility"),
-                'gravitational_acceleration': self.parameters.get("gravitational_acceleration"),
-                'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
-                'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'density_liquid': self.parameters.density_heavy,
+                'density_gas': self.parameters.density_light,
+                'surface_tension': self.parameters.surface_tension,
+                'mobility': self.parameters.mobility,
+                'gravitational_acceleration': self.parameters.gravitational_acceleration,
+                'relaxation_time_liquid': self.parameters.relaxation_time_heavy,
+                'relaxation_time_gas': self.parameters.relaxation_time_light,
                 'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
@@ -177,4 +183,4 @@ class Scenario:
 
 
 scenarios = wlb.ScenarioManager()
-scenarios.add(Scenario())
+scenarios.add(Scenario(cuda_enabled_mpi=False))
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
index 54f63d35f9ca27d6095d07751a8bb36bd910c3a1..8b7d788534ab0895d71b818c72b95e5109c39a4c 100755
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase_rising_bubble.py
@@ -14,15 +14,21 @@ class Scenario:
 
         # simulation parameters
         self.timesteps = 10000
+
+        # The domain decomposition can be specified manually by giving the number of cells per block
+        # and the number of blocks. The number of blocks must match the number of MPI processes used.
+        # If only the total domain size is specified via 'cells', waLBerla takes care of the
+        # decomposition at runtime, depending on the number of MPI processes.
+
+        # self.cell_per_block = (32, 32, 64)
+        # self.blocks = (2, 4, 1)
+        # self.size = (self.cell_per_block[0] * self.blocks[0],
+        #              self.cell_per_block[1] * self.blocks[1],
+        #              self.cell_per_block[2] * self.blocks[2])
+
         self.cells = (64, 128, 64)
-        self.blocks = (1, 1, 1)
+        self.size = self.cells
         self.periodic = (0, 0, 0)
-        self.size = (self.cells[0] * self.blocks[0],
-                     self.cells[1] * self.blocks[1],
-                     self.cells[2] * self.blocks[2])
-
-        self.overlappingWidth = (8, 1, 1)
-        self.timeStepStrategy = 'normal'
 
         # physical parameters
         self.bubbleRadius = 16
@@ -43,7 +49,6 @@ class Scenario:
 
         # everything else
         self.dbFile = "risingBubble3D.db"
-
         self.scenario = 1  # 1 rising bubble, 2 RTI
 
         self.counter = 0
@@ -56,29 +61,28 @@ class Scenario:
     def config(self):
         return {
             'DomainSetup': {
-                'blocks': self.blocks,
-                'cellsPerBlock': self.cells,
+                # 'blocks': self.blocks,
+                # 'cellsPerBlock': self.cell_per_block,
+                'cells': self.cells,
                 'periodic': self.periodic,
             },
             'Parameters': {
                 'timesteps': self.timesteps,
                 'vtkWriteFrequency': self.vtkWriteFrequency,
                 'dbWriteFrequency': self.dbWriteFrequency,
-                'timeStepStrategy': self.timeStepStrategy,
-                'overlappingWidth': self.overlappingWidth,
                 'remainingTimeLoggerFrequency': 10.0,
                 'scenario': self.scenario,
                 'cudaEnabledMpi': self.cudaEnabledMpi,
                 'gpuBlockSize': self.cuda_blocks
             },
             'PhysicalParameters': {
-                'density_liquid': self.density_heavy,
-                'density_gas': self.parameters["density_light"],
-                'surface_tension': self.parameters["surface_tension"],
-                'mobility': self.parameters.get("mobility", 0.1),
-                'gravitational_acceleration': self.parameters["gravitational_acceleration"],
-                'relaxation_time_liquid': self.parameters.get("relaxation_time_heavy"),
-                'relaxation_time_gas': self.parameters.get("relaxation_time_light"),
+                'density_liquid': self.parameters.density_heavy,
+                'density_gas': self.parameters.density_light,
+                'surface_tension': self.parameters.surface_tension,
+                'mobility': self.parameters.mobility,
+                'gravitational_acceleration': self.parameters.gravitational_acceleration,
+                'relaxation_time_liquid': self.parameters.relaxation_time_heavy,
+                'relaxation_time_gas': self.parameters.relaxation_time_light,
                 'interface_thickness': self.interface_thickness
             },
             'Boundaries': {
@@ -145,4 +149,4 @@ class Scenario:
 
 
 scenarios = wlb.ScenarioManager()
-scenarios.add(Scenario())
+scenarios.add(Scenario(cuda_enabled_mpi=False))