diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index e1b183d8532a11643345d7cecd3df4cea784d64c..32d06a834473ff86e05f75a4d0c4cccec3a07e67 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -89,15 +89,27 @@ int main( int argc, char **argv )
 
       lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
-      //lbm::GeneratedFixedDensity pressure(blocks, pdfFieldGpuID);
 
       ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
       noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
-      //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
 
        // Communication setup
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-      int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
+
+      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
+      CommunicationSchemeType communicationScheme;
+      if ( communicationSchemeStr == "GPUPackInfo_Baseline" )
+          communicationScheme = GPUPackInfo_Baseline;
+      else if ( communicationSchemeStr == "GPUPackInfo_Streams" )
+          communicationScheme = GPUPackInfo_Streams;
+      else if ( communicationSchemeStr == "UniformGPUScheme_Baseline" )
+          communicationScheme = UniformGPUScheme_Baseline;
+      else if ( communicationSchemeStr == "UniformGPUScheme_Memcpy" )
+          communicationScheme = UniformGPUScheme_Memcpy;
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO( "Invalid choice for communicationScheme: " << communicationSchemeStr )
+      }
+
       Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
 
 
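Reviewer note, not part of the patch: the if/else chain above works, but the same string-to-enum dispatch can be table-driven, which keeps the accepted names and their enumerators in one place and makes adding a scheme a one-line change. Below is a minimal self-contained sketch; the enumerator names are taken from the hunk above, while the standalone enum definition and the parseCommunicationScheme helper are hypothetical stand-ins for the waLBerla types.

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Stand-in for the CommunicationSchemeType used in the hunk above.
enum CommunicationSchemeType
{
   GPUPackInfo_Baseline,
   GPUPackInfo_Streams,
   UniformGPUScheme_Baseline,
   UniformGPUScheme_Memcpy
};

// Table-driven variant of the if/else chain: one entry per accepted name.
CommunicationSchemeType parseCommunicationScheme( const std::string & name )
{
   static const std::map< std::string, CommunicationSchemeType > schemes = {
      { "GPUPackInfo_Baseline",      GPUPackInfo_Baseline      },
      { "GPUPackInfo_Streams",       GPUPackInfo_Streams       },
      { "UniformGPUScheme_Baseline", UniformGPUScheme_Baseline },
      { "UniformGPUScheme_Memcpy",   UniformGPUScheme_Memcpy   }
   };
   const auto it = schemes.find( name );
   if( it == schemes.end() )
      // the app would call WALBERLA_ABORT_NO_DEBUG_INFO here instead
      throw std::invalid_argument( "Invalid choice for communicationScheme: " + name );
   return it->second;
}

int main()
{
   std::cout << parseCommunicationScheme( "UniformGPUScheme_Memcpy" ) << '\n';   // prints 3
   return 0;
}
```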
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index c6c1e18b2e4ce242241912d2cfb46291a3cfab12..81a0996f872ab2f409a0bef1a9619de8a2095e6b 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -7,18 +7,21 @@ DomainSetup
 
 Parameters 
 {
-	omega           1.8;
-	timesteps       10000;
-    warmupSteps     0;
-    outerIterations 1;
 
-	remainingTimeLoggerFrequency 30;
-	vtkWriteFrequency 500;
+	timesteps       10000;  // time steps of one performance measurement
+	warmupSteps     0;      // number of steps to run before measurement starts
+	outerIterations 1;      // how many measurements to conduct
 
-	cudaEnabledMPI false;
+	// Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
+	communicationScheme UniformGPUScheme_Baseline;
 
-	timeStepStrategy noOverlap;
-	innerOuterSplit < 64, 1, 1>;
+	vtkWriteFrequency 0;             // write a VTK file every nth step; 0 disables VTK output
+	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
+
+	timeStepStrategy noOverlap;      // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
+	innerOuterSplit < 32, 1, 1>;     // thickness of the slices that the 'outer' kernels process when overlapping
+
+	remainingTimeLoggerFrequency 0;  // interval in seconds to log the estimated remaining time
 }
 
 /*
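Reviewer note, not part of the patch: this Parameters block is consumed through waLBerla's Config API. The getParameter calls below mirror those visible in the UniformGridGPU.cpp hunk above; the surrounding Environment boilerplate follows waLBerla's usual pattern and is a sketch under that assumption, not a quote from the app.

```cpp
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/math/Vector3.h"

#include <iostream>
#include <string>

using namespace walberla;

int main( int argc, char ** argv )
{
   Environment env( argc, argv );   // loads the .prm file passed on the command line

   // Fetch the "Parameters" block; each getParameter falls back to its
   // default if the key is missing from the file.
   auto parameters = env.config()->getOneBlock( "Parameters" );

   const uint_t timesteps     = parameters.getParameter< uint_t >( "timesteps", uint_c( 10000 ) );
   const bool cudaEnabledMPI  = parameters.getParameter< bool >( "cudaEnabledMPI", false );
   const std::string strategy = parameters.getParameter< std::string >( "timeStepStrategy", "noOverlap" );
   const Vector3< int > innerOuterSplit =
      parameters.getParameter< Vector3< int > >( "innerOuterSplit", Vector3< int >( 1, 1, 1 ) );

   std::cout << timesteps << " steps, strategy " << strategy << ", split " << innerOuterSplit
             << ", CUDA-aware MPI: " << cudaEnabledMPI << std::endl;
   return 0;
}
```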
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
old mode 100644
new mode 100755
index 123b953817391e225275209ba00641d8f6d9ffd9..420be74fed17999af756a54aa7330bd9217580cf
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
@@ -60,15 +60,21 @@ def overlap_benchmark():
                           (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                           (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
 
-    scenarios.add(Scenario(timeStepStrategy='noOverlap'))
-    for strategy in ['simpleOverlap', 'complexOverlap']:
-        for inner_outer_split in inner_outer_splits:
-            scenario = Scenario(timeStepStrategy=strategy, innerOuterSplit=inner_outer_split)
-            scenarios.add(scenario)
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:  # other valid choices: 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1)))
 
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split)
+                scenarios.add(scenario)
 
-if __name__ == '__main__':
-    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+
+def generate_jobscripts(machine='pizdaint_hybrid'):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
         with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
             js = createJobscript(nodes=node_count,
                                  output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
@@ -77,9 +83,14 @@ if __name__ == '__main__':
                                  exe_name='UniformGridBenchmarkGPU',
                                  parameter_files=['overlap_benchmark.py'],
                                  wall_time=timedelta(minutes=25),
-                                 machine='pizdaint_hybrid',
+                                 machine=machine,
                                  account='d105',
                                  )
             f.write(js)
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
 else:
     overlap_benchmark()
diff --git a/src/cuda/DeviceSelectMPI.cpp b/src/cuda/DeviceSelectMPI.cpp
index f8eee582e69528c32caf4f34f0d742c7a1548a28..3ba255d9f6fd926721477158243022c0012611ed 100644
--- a/src/cuda/DeviceSelectMPI.cpp
+++ b/src/cuda/DeviceSelectMPI.cpp
@@ -58,8 +58,8 @@ void selectDeviceBasedOnMpiRank()
    }
    else if ( deviceCount > processesOnNode )
    {
-      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node "
-                               << processesOnNode << " available GPUs on node " << deviceCount );
+      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node: "
+                               << processesOnNode << ", available GPUs on node: " << deviceCount );
       WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
    }
    else
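Reviewer note, not part of the patch: the code around this warning binds each MPI rank to one GPU of its node. Below is a self-contained sketch of the underlying technique, using an MPI shared-memory communicator split to obtain the node-local rank; waLBerla's selectDeviceBasedOnMpiRank has further branches not visible in this hunk, and the round-robin fallback here is my assumption, not the library's behavior.

```cpp
#include <cstdio>
#include <cuda_runtime.h>
#include <mpi.h>

int main( int argc, char ** argv )
{
   MPI_Init( &argc, &argv );

   // Ranks sharing a node end up in the same shared-memory communicator.
   MPI_Comm nodeComm;
   MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &nodeComm );

   int rankOnNode = 0, processesOnNode = 1;
   MPI_Comm_rank( nodeComm, &rankOnNode );
   MPI_Comm_size( nodeComm, &processesOnNode );

   int deviceCount = 0;
   cudaGetDeviceCount( &deviceCount );
   if( deviceCount == 0 )
   {
      std::fprintf( stderr, "No CUDA device found on this node\n" );
      MPI_Abort( MPI_COMM_WORLD, 1 );
   }

   if( deviceCount > processesOnNode )
      std::printf( "Not using all available GPUs on node. Processes on node: %d, available GPUs on node: %d\n",
                   processesOnNode, deviceCount );

   // Round-robin assignment; equals cudaSetDevice(rankOnNode) whenever there
   // are at least as many GPUs as processes on the node.
   cudaSetDevice( rankOnNode % deviceCount );

   MPI_Comm_free( &nodeComm );
   MPI_Finalize();
   return 0;
}
```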