diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index db5e25972442ba7fb2384d71b81dc42fe661aa8f..58a4d0356579a8fcfe89078bc947d44fdfb3fe43 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -76,9 +76,14 @@ int main( int argc, char **argv )
 
       // Boundaries
       const FlagUID fluidFlagUID( "Fluid" );
-      auto boundariesConfig = config->getOneBlock( "Boundaries" );
-      geometry::initBoundaryHandling<FlagField_T>(*blocks, flagFieldID, boundariesConfig);
-      geometry::setNonBoundaryCellsToDomain<FlagField_T>(*blocks, flagFieldID, fluidFlagUID);
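+      // The "Boundaries" block is optional now: if it is missing from the parameter
+      // file, all cells stay fluid and the boundary handling is disabled entirely.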
+      auto boundariesConfig = config->getBlock( "Boundaries" );
+      bool disableBoundaries = true;
+      if( boundariesConfig )
+      {
+          disableBoundaries = false;
+          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+      }
 
       lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
@@ -88,19 +93,17 @@ int main( int argc, char **argv )
       noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
       //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
 
-      bool disableBoundaries = parameters.getParameter<bool>( "disableBoundaries", false );
-      bool kernelOnly = parameters.getParameter<bool>( "kernelOnly", false );
-
        // Communication setup
-      bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
       int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
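+      // Thickness (in cells, per direction) of the outer layer of each block; with an
+      // overlapping time step only this layer is computed after communication has finished.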
+      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
 
       int streamHighPriority = 0;
       int streamLowPriority = 0;
       WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
 
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega );
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
       lbKernel.setOuterPriority( streamHighPriority );
       UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
          gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
@@ -110,8 +113,21 @@ int main( int argc, char **argv )
       auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
       auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
 
+      uint_t currentTimeStep = 0;
+
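+      // Simple overlap strategy: start the halo exchange, update the block interiors
+      // while the exchange is in flight, then finish with the outer layers.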
+      auto simpleOverlapTimeStep = [&] ()
+      {
+          gpuComm.startCommunication(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.inner( &block, defaultStream );
+          gpuComm.wait(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.outer( &block, defaultStream );
+      };
+
       auto overlapTimeStep = [&]()
       {
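+         // mark the whole time step as an NVTX range so it shows up in the profiler timeline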
+         cuda::NvtxRange namedRange("timestep");
          auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
 
          innerOuterSection.run([&]( auto innerStream )
@@ -131,7 +147,7 @@ int main( int argc, char **argv )
 
          innerOuterSection.run([&]( auto outerStream )
          {
-            cuda::nameStream(outerStream, "inner stream");
+            cuda::nameStream(outerStream, "outer stream");
             gpuComm( outerStream );
 
             for( auto &block: *blocks )
@@ -145,6 +161,7 @@ int main( int argc, char **argv )
                lbKernel.outer( &block, outerStream );
             }
          });
+         currentTimeStep += 1;
       };
 
 
@@ -171,13 +188,22 @@ int main( int argc, char **argv )
       };
 
       SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
-      std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) :
-                                                              std::function<void()>( normalTimeStep );
-      if( kernelOnly )
-      {
+
+      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "noOverlap");
+      std::function<void()> timeStep;
+      if (timeStepStrategy == "noOverlap")
+          timeStep = std::function<void()>( normalTimeStep );
+      else if (timeStepStrategy == "complexOverlap")
+          timeStep = std::function<void()>( overlapTimeStep );
+      else if (timeStepStrategy == "simpleOverlap")
+          timeStep = simpleOverlapTimeStep;
+      else if (timeStepStrategy == "kernelOnly") {
           WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this only makes sense for benchmarking!")
           timeStep = kernelOnlyFunc;
       }
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
+      }
 
       timeLoop.add() << BeforeFunction( timeStep  )
                      << Sweep( []( IBlock * ) {}, "time step" );
@@ -196,33 +222,46 @@ int main( int argc, char **argv )
          timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
       }
 
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
-      timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );
-
-
-      WcTimer simTimer;
-      cudaDeviceSynchronize();
-      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
-      simTimer.start();
-      timeLoop.run();
-      cudaDeviceSynchronize();
-      simTimer.end();
-      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");
-      auto time = simTimer.last();
-      auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-      auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess);
-      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c( timesteps ) );
-      WALBERLA_ROOT_SECTION()
-      {
-         python_coupling::PythonCallback pythonCallbackResults ( "results_callback" );
-         if ( pythonCallbackResults.isCallable() )
-         {
-            pythonCallbackResults.data().exposeValue( "mlups_per_process", mlupsPerProcess );
 
-            // Call Python function to report results
-            pythonCallbackResults();
-         }
+
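+      // run a few untimed warmup steps first, so that one-time initialization costs
+      // do not distort the measurement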
+      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
+      for( int i = 0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if (remainingTimeLoggerFrequency > 0) {
+          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+
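+      // repeat the timed run 'outerIterations' times; every repetition is measured
+      // and reported separately through the results callback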
+      for( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+      {
+          timeLoop.setCurrentTimeStepToZero();
+          WcTimer simTimer;
+          cudaDeviceSynchronize();
+          WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+          simTimer.start();
+          timeLoop.run();
+          cudaDeviceSynchronize();
+          simTimer.end();
+          WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+          auto time = simTimer.last();
+          auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+          auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+          WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+          WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+          WALBERLA_ROOT_SECTION()
+          {
+              python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+              if ( pythonCallbackResults.isCallable())
+              {
+                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                  pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
+                  // Call Python function to report results
+                  pythonCallbackResults();
+              }
+          }
       }
    }
 
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index 22877854be5e21834fe8005bc6e1636ef2cce15e..c6c1e18b2e4ce242241912d2cfb46291a3cfab12 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -1,32 +1,32 @@
+DomainSetup
+{
+   blocks        <  1,    1,   1 >;
+   cellsPerBlock <  256, 256, 128 >;
+   periodic      <  1,    1,   1 >;
+}
 
 Parameters 
 {
 	omega           1.8;
-	timesteps       500;
+	timesteps       10000;
+	warmupSteps     0;
+	outerIterations 1;
 
-	remainingTimeLoggerFrequency 3;
-	vtkWriteFrequency 0;
+	remainingTimeLoggerFrequency 30;
+	vtkWriteFrequency 500;
 
-	overlapCommunication true;
 	cudaEnabledMPI false;
 
-	kernelOnly false;
-	disableBoundaries false;
-}
-
-DomainSetup
-{
-   blocks        <  1,    1, 1 >;
-   cellsPerBlock <  300, 300, 150 >;
-   periodic      <  0,    0, 1 >;  
+	timeStepStrategy noOverlap;
+	innerOuterSplit < 64, 1, 1 >;
 }
 
-Boundaries 
+/*
+Boundaries
 {
-    /*
 	Border { direction W;    walldistance -1;  flag NoSlip; }
 	Border { direction E;    walldistance -1;  flag NoSlip; }
     Border { direction S;    walldistance -1;  flag NoSlip; }
     Border { direction N;    walldistance -1;  flag UBB; }
-    */
 }
+*/
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..65ad0b5bf8e7d723673bcb356963a544f442c8c9
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import os
+from copy import deepcopy
+from datetime import timedelta
+
+import pandas as pd
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.jobscripts import createJobscript
+
+
+CSV_FILE = "overlap_benchmark.csv"
+
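+# Default setup shared by all scenarios; each Scenario overrides entries in
+# 'Parameters' through its keyword arguments.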
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 256, 256),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'timesteps': 400,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 1,
+    }
+}
+
+
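+# One benchmark run. The waLBerla app drives the simulation and calls back into
+# this object: config() supplies the parameter dictionary, results_callback()
+# receives the values exposed on the C++ side and appends them to the CSV file.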
+class Scenario:
+    def __init__(self, **kwargs):
+        self.config_dict = deepcopy(BASE_CONFIG)  # deep copy, so scenarios do not share the nested dicts
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        sequenceValuesToScalars(data)
+
+        df = pd.DataFrame.from_records([data])
+        if not os.path.isfile(CSV_FILE):
+            df.to_csv(CSV_FILE, index=False)
+        else:
+            df.to_csv(CSV_FILE, index=False, mode='a', header=False)
+
+
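+# Register one scenario per (time step strategy, inner/outer split) combination.
+# The split is irrelevant without overlap, so 'noOverlap' is run only once.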
+def overlap_benchmark():
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), (64, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), (64, 64, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32), (64, 64, 64)]
+
+    for strategy in ['simpleOverlap', 'complexOverlap', 'noOverlap']:
+        for inner_outer_split in inner_outer_splits:
+            if strategy == 'noOverlap' and inner_outer_split != (1, 1, 1):
+                continue
+            scenario = Scenario(timeStepStrategy=strategy, innerOuterSplit=inner_outer_split)
+            scenarios.add(scenario)
+
+
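+# Run directly, this script only writes the job scripts; when waLBerla executes it
+# as a parameter file, __name__ is not '__main__' and the scenarios are registered
+# with the ScenarioManager instead.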
+if __name__ == '__main__':
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+        with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
+            js = createJobscript(nodes=node_count,
+                                 output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
+                                 error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
+                                 initial_dir=os.getcwd(),
+                                 exe_name='UniformGridBenchmarkGPU',
+                                 parameter_files=['overlap_benchmark.py'],
+                                 wall_time=timedelta(minutes=25),
+                                 machine='pizdaint_hybrid',
+                                 account='d105',
+                                 )
+            f.write(js)
+else:
+    overlap_benchmark()