diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 6e704b430899c6e2545e786b93a4606e1c110021..857df4d5f20c4be25a6dacaafd8e3c987a53531f 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -29,10 +29,12 @@ waLBerla_python_file_generates(UniformGridGPU_AA.py
         UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
         UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
         UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
+        UniformGridGPU_AA_Defines.h
         )
 
-set(config "srt")
-waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
-        FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
-        DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
-        CODEGEN_CFG ${config})
+foreach(config srt trt mrt smagorinsky entropic)
+    waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
+            FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
+            DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
+            CODEGEN_CFG ${config})
+endforeach()
diff --git a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
index fe038ebdb41d7102cca994d742b3139e08eb30f7..2aed66b1ade0380aed7ff59080367897937dae12 100644
--- a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
+++ b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
@@ -6,7 +6,7 @@ namespace walberla {
 
 
 inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
-                              const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
+                              const real_t xMagnitude=0.005, const real_t fluctuationMagnitude=0.05 )
 {
     math::seedRandomGenerator(0);
     auto halfZ = blocks->getDomainCellBB().zMax() / 2;
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index ff2c93c4db3dbcca34e2aaeeb75994c7c848abdb..a413df6934496259c62fb21b3295a5e9d64186e8 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -151,6 +151,8 @@ int main( int argc, char **argv )
           communicationScheme = UniformGPUScheme_Memcpy;
       else if (communicationSchemeStr == "MPIDatatypes")
           communicationScheme = MPIDatatypes;
+      else if (communicationSchemeStr == "MPIDatatypesFull")
+          communicationScheme = MPIDatatypesFull;
       else {
           WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
       }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index de2821b8277003c09714ea8ad8f8d161f42c69ec..3ef98e079053e48b38a698729785f7521dffb094 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -1,27 +1,28 @@
 DomainSetup
 {
    blocks        <  1,    1,   1 >;
-   cellsPerBlock <  512, 256, 256 >;
+   cellsPerBlock <  128, 128, 128 >;
    periodic      <  1,    1,   1 >;
 }
 
 Parameters 
 {
 
-	timesteps       2000; //10000;  // time steps of one performance measurement
-	warmupSteps     0;      // number of steps to run before measurement starts
-    outerIterations 1;      // how many measurements to conduct
+	timesteps       1000;   // time steps of one performance measurement
+	warmupSteps     100;    // number of steps to run before measurement starts
+    outerIterations 3;      // how many measurements to conduct
 
     // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
     communicationScheme UniformGPUScheme_Baseline;
 
-	vtkWriteFrequency 0; //100;             // write a VTK file every n'th step, if zero VTK output is disabled
+	vtkWriteFrequency 0;             // write a VTK file every n'th step, if zero VTK output is disabled
 	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
 
-	timeStepStrategy noOverlap;      // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
+	timeStepStrategy kernelOnly;    // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
 	innerOuterSplit < 8, 1, 1>;     // slice-thickness that 'outer'-kernels process when overlapping
 
 	remainingTimeLoggerFrequency 5;  // interval in seconds to log the estimated remaining time
+	gpuBlockSize < 128, 1, 1>;
 
 	omega 1.92;
 	initShearFlow 1;
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 80a4a3490b825d855d1ba0a30b0e3964e76a1c0f..b95b514b5ed4b29aeacca7f71b38440f5f7d8e0e 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -12,10 +12,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 
 omega = sp.symbols("omega")
-# sweep_block_size = (128, 1, 1)
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
 
 sweep_params = {'block_size': sweep_block_size}
 
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
index 348f63fd0dd5d08fdb481cecde9ab25222e17bc4..be38e3cb3eb6fcaadaa3caca922776b7b2b6f480 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
@@ -21,9 +21,13 @@
 #include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/DeviceSelectMPI.h"
 #include "domain_decomposition/SharedSweep.h"
-#include "stencil/D3Q19.h"
-#include "stencil/D3Q27.h"
 #include "InitShearVelocity.h"
+#include "gui/Gui.h"
+
+#ifdef WALBERLA_ENABLE_GUI
+#include "lbm/gui/PdfFieldDisplayAdaptor.h"
+#endif
+
 
 #include "UniformGridGPU_AA_PackInfoPush.h"
 #include "UniformGridGPU_AA_PackInfoPull.h"
@@ -31,11 +35,12 @@
 #include "UniformGridGPU_AA_MacroGetter.h"
 #include "UniformGridGPU_AA_LbKernelEven.h"
 #include "UniformGridGPU_AA_LbKernelOdd.h"
+#include "UniformGridGPU_AA_Defines.h"
 
+#include <cmath>
 
 using namespace walberla;
 
-using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script
 using CommunicationStencil_T = Stencil_T;
 using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
 using VelocityField_T = GhostLayerField< real_t, 3 >;
@@ -50,6 +55,8 @@ int main( int argc, char **argv )
     {
         WALBERLA_MPI_WORLD_BARRIER();
 
+        WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+
         auto config = *cfg;
         logging::configureLogging( config );
         auto blocks = blockforest::createUniformBlockGridFromConfig( config );
@@ -61,7 +68,7 @@ int main( int argc, char **argv )
         const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
 
         // Creating fields
-        BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
+        BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
         BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
 
         WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
@@ -215,32 +222,52 @@ int main( int argc, char **argv )
             timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
         }
 
-        for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+        bool useGui = parameters.getParameter<bool>( "useGui", false );
+        if( useGui )
+        {
+            cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
+            timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
+            GUI gui( timeLoop, blocks, argc, argv );
+            gui.registerDisplayAdaptorCreator(
+                [&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
+                    if ( block.isDataOfType< PdfField_T >( blockDataID) )
+                        return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
+                    return nullptr;
+                });
+            gui.run();
+        }
+        else
         {
-            timeLoop.setCurrentTimeStepToZero();
-            WcTimer simTimer;
-            cudaDeviceSynchronize();
-            WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-            simTimer.start();
-            timeLoop.run();
-            cudaDeviceSynchronize();
-            simTimer.end();
-            WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-            auto time = simTimer.last();
-            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-
-            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-            WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-            WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-            WALBERLA_ROOT_SECTION()
+            for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
             {
-                python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                if ( pythonCallbackResults.isCallable())
+                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+
+                timeLoop.setCurrentTimeStepToZero();
+                WcTimer simTimer;
+                cudaDeviceSynchronize();
+                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+                WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+                simTimer.start();
+                timeLoop.run();
+                cudaDeviceSynchronize();
+                simTimer.end();
+                WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+                auto time = simTimer.last();
+                auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+
+                auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+                WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+                WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+                WALBERLA_ROOT_SECTION()
                 {
-                    pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                    pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
-                    // Call Python function to report results
-                    pythonCallbackResults();
+                    python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+                    if ( pythonCallbackResults.isCallable())
+                    {
+                        pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                        pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
+                        // Call Python function to report results
+                        pythonCallbackResults();
+                    }
                 }
             }
         }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
index 45ea8043d0cd1fe2a78080e70a93e9874844d40f..105cb48f4800cbc82ba16908025eb9dd120d8777 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
@@ -10,10 +10,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 
 omega = sp.symbols("omega")
-# sweep_block_size = (128, 1, 1)
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
 
 sweep_params = {'block_size': sweep_block_size}
 
@@ -58,7 +62,9 @@ with CodeGeneration() as ctx:
     common_options = {
         'field_name': 'pdfs',
         'optimization': {'cse_global': True,
-                         'cse_pdfs': False}
+                         'cse_pdfs': False,
+                         'field_layout': 'fzyx',
+                         }
     }
     options = options_dict.get(ctx.config, options_dict['srt'])
     options.update(common_options)
@@ -98,3 +104,6 @@ with CodeGeneration() as ctx:
     # communication
     generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
     generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
+
+    ctx.write_file("UniformGridGPU_AA_Defines.h",
+                   '#include "stencil/D3Q{0}.h"\nusing Stencil_T = walberla::stencil::D3Q{0};\n'.format(q))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
index 6a3bf6b532fe67b753cf2d64737a25d7747e067d..db0ec86e6cc1f58c4548a57023ac8ce49d69478f 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
@@ -6,6 +6,7 @@
 #include "blockforest/communication/UniformBufferedScheme.h"
 #include "blockforest/communication/UniformDirectScheme.h"
 #include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
+#include "field/communication/UniformMPIDatatypeInfo.h"
 #include "cuda/communication/GPUPackInfo.h"
 #include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/communication/MemcpyPackInfo.h"
@@ -20,7 +21,8 @@ enum CommunicationSchemeType {
     GPUPackInfo_Streams = 1,
     UniformGPUScheme_Baseline = 2,
     UniformGPUScheme_Memcpy = 3,
-    MPIDatatypes = 4
+    MPIDatatypes = 4,
+    MPIDatatypesFull = 5
 };
 
 
@@ -36,6 +38,7 @@ public:
         auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
         auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
         auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
+        auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId );
 
         switch(_commSchemeType)
         {
@@ -63,6 +66,12 @@ public:
                 }
                 _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo );
                 break;
+            case MPIDatatypesFull:
+                if( ! cudaEnabledMPI ) {
+                    WALBERLA_ABORT("MPI datatype-based communication requires a CUDA-enabled MPI implementation (cudaEnabledMPI).");
+                }
+                _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfoFull );
+                break;
             default:
                 WALBERLA_ABORT("Invalid GPU communication scheme specified!");
         }
@@ -103,6 +112,7 @@ public:
                 _gpuCommunicationScheme->startCommunication( communicationStream );
                 break;
             case MPIDatatypes:
+            case MPIDatatypesFull:
                 WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
                 _directScheme->startCommunication();
                 break;
@@ -131,6 +141,7 @@ public:
                 _gpuCommunicationScheme->wait( communicationStream );
                 break;
             case MPIDatatypes:
+            case MPIDatatypesFull:
                 WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
                 _directScheme->wait();
                 break;
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
index 420be74fed17999af756a54aa7330bd9217580cf..1beacb0059ba1df29e03cc71387ee645cf730268 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
@@ -9,6 +9,7 @@ from os import getcwd
 from waLBerla.tools.jobscripts import createJobscript
 from datetime import timedelta
 from copy import deepcopy
+import sys
 
 CSV_FILE = "overlap_benchmark.csv"
 
@@ -22,16 +23,18 @@ BASE_CONFIG = {
         'timesteps': 400,
         'cudaEnabledMPI': False,
         'warmupSteps': 5,
-        'outerIterations': 1,
+        'outerIterations': 3,
+        'initShearFlow': True,
     }
 }
 
 
 class Scenario:
-    def __init__(self, **kwargs):
+    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
         self.config_dict = deepcopy(BASE_CONFIG)
         self.config_dict['Parameters'].update(kwargs)
         self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
 
     @wlb.member_callback
     def config(self, **kwargs):
@@ -45,6 +48,10 @@ class Scenario:
         data.update(self.config_dict['Parameters'])
         data.update(self.config_dict['DomainSetup'])
         data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
         sequenceValuesToScalars(data)
 
         df = pd.DataFrame.from_records([data])
@@ -73,15 +80,50 @@ def overlap_benchmark():
                 scenarios.add(scenario)
 
 
-def generate_jobscripts(machine='pizdaint_hybrid'):
+def single_gpu_benchmark():
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            cells = block_size[0] * block_size[1] * block_size[2]
+            time_steps_for_128_cubed = 1000
+            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=int(time_steps))
+            scenarios.add(scenario)
+
+
+all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
+                   'UniformGridBenchmarkGPU_AA_mrt',
+                   'UniformGridBenchmarkGPU_AA_smagorinsky',
+                   'UniformGridBenchmarkGPU_AA_srt',
+                   'UniformGridBenchmarkGPU_AA_trt',
+                   'UniformGridBenchmarkGPU_entropic',
+                   'UniformGridBenchmarkGPU_mrt',
+                   'UniformGridBenchmarkGPU_smagorinsky',
+                   'UniformGridBenchmarkGPU_srt',
+                   'UniformGridBenchmarkGPU_trt')
+
+
+def generate_jobscripts(machine='pizdaint_hybrid',
+                        exe_names=all_executables):
     for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
         with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
+
             js = createJobscript(nodes=node_count,
                                  output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
                                  error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
                                  initial_dir=getcwd(),
-                                 exe_name='UniformGridBenchmarkGPU',
-                                 parameter_files=['overlap_benchmark.py'],
+                                 #exe_name='UniformGridBenchmarkGPU',
+                                 #parameter_files=['overlap_benchmark.py'],
+                                 commands=list((exe, 'overlap_benchmark.py') for exe in exe_names),
                                  wall_time=timedelta(minutes=25),
                                  machine=machine,
                                  account='d105',
@@ -93,4 +135,4 @@ if __name__ == '__main__':
     print("Called without waLBerla - generating job scripts for PizDaint")
     generate_jobscripts()
 else:
-    overlap_benchmark()
+    single_gpu_benchmark()