diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index c1ed30eafa8876741fc36490d51e79f998d131a0..391feddd71867bc6e3e3618edb1f878c8426e5ef 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -98,7 +98,7 @@ int main( int argc, char **argv )
       const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
       const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
       const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
-      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
+      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
 
       // Creating fields
       BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
@@ -165,7 +165,12 @@ int main( int argc, char **argv )
       }
 
       Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
-
+      for(int i=0; i< 3; ++i)
+      {
+          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
+              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
+          }
+      }
 
       int streamHighPriority = 0;
       int streamLowPriority = 0;
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index c8a2575067e0947cb135f566c85a8175042168e5..9ae07cd26653e21bc2f02fffc086386d816613a0 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -1,7 +1,7 @@
 import sympy as sp
 import numpy as np
 import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor
 from pystencils_walberla import generate_pack_info_from_kernel
@@ -39,7 +39,7 @@ options_dict = {
     'mrt': {
         'method': 'mrt',
         'stencil': 'D3Q19',
-        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
+        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
     },
     'mrt_full': {
         'method': 'mrt',
@@ -147,7 +147,7 @@ with CodeGeneration() as ctx:
     # CPU lattice model - required for macroscopic value computation, VTK output etc.
     options_without_opt = options.copy()
     del options_without_opt['optimization']
-    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method, update_rule_params=options_without_opt)
+    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, **options_without_opt))
 
     # gpu LB sweep & boundaries
     generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
index fdf1c1bf759ec07236098a22d70111fa9561a2da..c5fb073abd4a45c69308e6be92c338f3f2d0e314 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
@@ -83,6 +83,14 @@ int main( int argc, char **argv )
         BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
 
         Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
+
+        for(int i=0; i< 3; ++i)
+        {
+            if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
+                WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
+            }
+        }
+
         Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
         bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
         Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
index 08d8b875ac232f76dbf1a065a6e7ee4810c8fc66..882de06f9d1bbe5ae2a189d0b38a0dd707d98bc5 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
@@ -36,7 +36,7 @@ options_dict = {
     'mrt': {
         'method': 'mrt',
         'stencil': 'D3Q19',
-        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
+        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
     },
     'entropic': {
         'method': 'mrt3',
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
deleted file mode 100644
index 235bfe948d57ae5947a944b368aa87bdc4d64e9c..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# encoding: utf-8
-
-import math
-import operator
-from functools import reduce
-import waLBerla as wlb
-
-# Constants that define the size of blocks that are used in the benchmarks
-MIN_CELLS_PER_BLOCK = 16
-MAX_CELLS_PER_BLOCK = 256
-INC_CELLS_PER_BLOCK = 16
-# Amount of cells per block
-cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, INC_CELLS_PER_BLOCK)
-# Blocks with size in [16, 32, 64, 128, 256]
-cells_per_block = [num_cells for num_cells in cells_per_block_interval]
-# Number of active MPI processes
-num_processes = wlb.mpi.numProcesses()
-# Whether to overlap computation with communication
-overlap_communication = [False, True]
-# Whether MPI supports buffers in GPU memory
-cuda_enabled_mpi = [False, True]
-# Supported communication schemes
-communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
-
-
-def calculate_time_steps(runtime, expected_mlups, domain_size):
-    cells = reduce(operator.mul, domain_size, 1)
-    time_steps_per_second = expected_mlups * 1e6 / cells
-    return int(time_steps_per_second * runtime)
-
-
-def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
-    bytes_per_cell = 19 * 2 * 8
-    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
-    return int(max_cells**(1/3))
-
-
-def get_block_decomposition(block_decomposition, num_processes):
-    bx = by = bz = 1
-    blocks_per_axis = int(math.log(num_processes, 2))
-    for i in range(blocks_per_axis):
-        decomposition_axis = block_decomposition[i % len(block_decomposition)]
-        if decomposition_axis == 'y':
-            by *= 2
-        elif decomposition_axis == 'z':
-            bz *= 2
-        elif decomposition_axis == 'x':
-            bx *= 2
-
-    assert (bx * by * bz) == num_processes
-    return bx, by, bz
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
deleted file mode 100644
index daf4fee66347cbe31015aaf1420318689653c8bb..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# encoding: utf-8
-
-import os
-import pandas as pd
-import waLBerla as wlb
-import copy
-from datetime import datetime
-
-
-CommunicationSchemeType = {
-    'GPUPackInfo_Baseline': 0,
-    'GPUPackInfo_Streams': 1,
-    'UniformGPUScheme_Baseline': 2,
-    'UniformGPUScheme_Memcpy': 3,
-}
-
-CommunicationSchemeName = {
-    0: 'GPUPackInfo_Baseline',
-    1: 'GPUPackInfo_Streams',
-    2: 'UniformGPUScheme_Baseline',
-    3: 'UniformGPUScheme_Memcpy',
-}
-
-# Base configuration for the benchmark
-BASE_CONFIG = {
-    'DomainSetup': {
-        'cellsPerBlock': (64, 64, 64),
-        'blocks': (1, 1, 1),
-        'nrOfProcesses': (1, 1, 1),
-        'periodic': (0, 0, 1),
-        'dx': 1.0
-    },
-    'Parameters': {
-        'omega': 1.8,
-        'timesteps': 1001,
-        'remainingTimeLoggerFrequency': 250,
-        'vtkWriteFrequency': 0,
-        'overlapCommunication': False,
-        'cudaEnabledMPI': False,
-        'initialVelocity': (0, 0, 0),
-        'performanceReportFrequency': 250,
-        'communicationScheme': CommunicationSchemeType['UniformGPUScheme_Baseline'],
-    },
-    'Boundaries': {
-        'Border': [
-            {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
-            {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-            {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
-            {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
-        ]
-    }
-}
-
-
-class BenchmarkScenario:
-    def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
-        self.testcase = testcase
-        self.scenario_config = copy.deepcopy(BASE_CONFIG)
-        self.scenario_config['Parameters']['timesteps'] = time_steps
-        self.fully_periodic = fully_periodic
-        if fully_periodic:
-            del self.scenario_config['Boundaries']['Border']
-            self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
-        self.decomposition_axes = decomposition_axes
-
-        now = datetime.now().replace(second=0, microsecond=0)
-        self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'
-
-    def get_data(self):
-        block_setup = self.scenario_config.get('DomainSetup')
-        params = self.scenario_config.get('Parameters')
-
-        return {
-            'processesX': block_setup.get('nrOfProcesses')[0],
-            'processesY': block_setup.get('nrOfProcesses')[1],
-            'processesZ': block_setup.get('nrOfProcesses')[2],
-            'blocksX': block_setup.get('blocks')[0],
-            'blocksY': block_setup.get('blocks')[1],
-            'blocksZ': block_setup.get('blocks')[2],
-            'fully_periodic': self.fully_periodic,
-            'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
-            'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
-            'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
-            'cudaEnabledMPI': params.get('cudaEnabledMPI'),
-            'overlapCommunication': params.get('overlapCommunication'),
-            'time_steps': params['timesteps'],
-            'domainDecomposition': self.decomposition_axes,
-            'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
-        }
-
-    @wlb.member_callback
-    def config(self, **kwargs):
-        from pprint import pformat
-        wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
-        return self.scenario_config
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
-        data = self.get_data()
-        data.update(kwargs)
-        self.save_data([data])
-
-    def save_data(self, data):
-        df = pd.DataFrame(data)
-        if not os.path.isfile(self.output_filename):
-            df.to_csv(self.output_filename, index=False)
-        else:
-            df.to_csv(self.output_filename, index=False, mode='a', header=False)
-
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
new file mode 100755
index 0000000000000000000000000000000000000000..ca66b364cd331050e12a3540036fbe0ab9bd083b
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""
+This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
+Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1, then run e.g.
+ ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py
+
+Look at the end of the file to select the benchmark to run
+"""
+
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition, toPrm
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+from copy import deepcopy
+import sys
+import sqlite3
+
+# Number of time steps run for a workload of 128^3 cells per GPU
+# (if twice as many cells are on the GPU, half as many time steps are run, etc.)
+# Increase this value to get more reliable measurements.
+TIME_STEPS_FOR_128_BLOCK = 200
+DB_FILE = "gpu_benchmark.sqlite3"
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+
+
+def num_time_steps(block_size):
+    cells = block_size[0] * block_size[1] * block_size[2]
+    time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK
+    return int(time_steps)
+
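+# A quick sanity check of the scaling above (illustration only): with
+# TIME_STEPS_FOR_128_BLOCK = 200, a (256, 128, 128) block holds twice as many
+# cells as 128^3, so num_time_steps((256, 128, 128)) == 100, while a
+# (64, 64, 64) block would run 8 * 200 == 1600 time steps.
+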
+
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
+        self.config_dict = deepcopy(BASE_CONFIG)
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        # Write out the configuration as text-based prm:
+        #print(toPrm(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        result = data
+
+        num_tries = 4
+        for num_try in range(num_tries):  # retry a few times: the write may fail when multiple benchmark processes access the database
+            try:
+                checkAndUpdateSchema(result, "runs", DB_FILE)
+                storeSingle(result, "runs", DB_FILE)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try+1, num_tries, str(e)))
+
+
+# -------------------------------------- Functions trying different parameter sets -------------------------------------------------------------------
+
+
+def overlap_benchmark():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running different communication overlap strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
+
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:  # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1)))
+
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split,
+                                    timesteps=num_time_steps((256, 128, 128)))
+                scenarios.add(scenario)
+
+
+def communication_compare():
+    """Tests different communication strategies"""
+    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+
+            sc = Scenario(cells_per_block=block_size,
+                          gpuBlockSize=(128, 1, 1),
+                          timeStepStrategy='noOverlap',
+                          communicationScheme=comm_strategy,
+                          timesteps=num_time_steps(block_size))
+            scenarios.add(sc)
+            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
+                if 3 * inner_outer_split[0] > block_size[0]:  # the outer part is 2*split thick, so keep at least one split width of inner cells
+                    continue
+                sc = Scenario(cells_per_block=block_size,
+                              gpuBlockSize=(128, 1, 1),
+                              timeStepStrategy='simpleOverlap',
+                              innerOuterSplit=inner_outer_split,
+                              communicationScheme=comm_strategy,
+                              timesteps=num_time_steps(block_size))
+                scenarios.add(sc)
+
+
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=num_time_steps(block_size))
+            scenarios.add(scenario)
+
+
+# -------------------------------------- Optional job script generation for PizDaint -------------------------------------------------------------------
+
+
+job_script_header = """
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --time=0:30:00
+#SBATCH --nodes={nodes}
+#SBATCH -o out_scaling_{nodes}_%j.txt
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+
+cd {folder}
+
+source ~/env.sh
+
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1            # allow GPU sharing
+export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+
+
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+
+ulimit -c 0
+"""
+
+job_script_exe_part = """
+
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+ ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+
+
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant',
+                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+
+
+def generate_jobscripts(exe_names=all_executables):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+
+        # run grid_order
+        import subprocess
+        decomposition = block_decomposition(node_count)
+        decomposition_str = ",".join(str(e) for e in decomposition)
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+
+        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
+        for exe in exe_names:
+            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, config='../communication_compare.py')
+
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write(job_script)
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    # Select the benchmark you want to run
+    single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU performance of compute kernel (no communication)
+    #communication_compare()  # benchmarks different communication routines, with and without overlap
+    #overlap_benchmark()      # benchmarks different communication overlap options
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
deleted file mode 100644
index 6498878fa229f8f2a7219fb2dc85abd32b211d19..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-                 cuda_enabled_mpi, cells_per_block, num_processes
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xy', 'yz', 'xz']
-
-scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
-
-testcase_name = "inter-node"
-
-for scenario_params in scenario_generator:
-    # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
-    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
-        # Skip CUDA enabled MPI tests for GPUPackInfo tests
-        continue
-    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
-        # Skip communication overlap tests for GPUPackInfo baseline
-        continue
-    # Convert the axes decompositions to string
-    decomposition_axes_str = ''.join(decomposition_axes)
-    # Compute block decomposition based on the specified axes and the number of processes
-    blocks = get_block_decomposition(decomposition_axes, num_processes)
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
-    # Domain Setup parameters
-    domain_setup = scenario.scenario_config['DomainSetup']
-    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
-    domain_setup['nrOfProcesses'] = blocks
-    domain_setup['blocks'] = blocks
-    # Additional parameters for benchmarking
-    params = scenario.scenario_config['Parameters']
-    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
-    params['overlapCommunication'] = is_communication_overlapped
-    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
-    # Add scenario for execution
-    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py
deleted file mode 100644
index 46b6c06002d0ac663dfeef6ae2dbb32bb8ba300d..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-                 cuda_enabled_mpi, cells_per_block, num_processes
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xy', 'yz', 'zx']
-
-scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
-
-testcase_name = "intra-node"
-
-for scenario_params in scenario_generator:
-    # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
-    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
-        # Skip CUDA enabled MPI tests for GPUPackInfo tests
-        continue
-    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
-        # Skip communication overlap tests for GPUPackInfo baseline
-        continue
-    # Convert the axes decompositions to string
-    decomposition_axes_str = ''.join(decomposition_axes)
-    # Compute block decomposition based on the specified axes and the number of processes
-    blocks = get_block_decomposition(decomposition_axes, num_processes)
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
-    # Domain Setup parameters
-    domain_setup = scenario.scenario_config['DomainSetup']
-    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
-    domain_setup['nrOfProcesses'] = blocks
-    domain_setup['blocks'] = blocks
-    # Additional parameters for benchmarking
-    params = scenario.scenario_config['Parameters']
-    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
-    params['overlapCommunication'] = is_communication_overlapped
-    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
-    # Add scenario for execution
-    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
deleted file mode 100755
index 6d64d4d89f7ed0b82a194997b08c60cbeabd236d..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
+++ /dev/null
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import pandas as pd
-import waLBerla as wlb
-from waLBerla.tools.config import block_decomposition
-from waLBerla.tools.sqlitedb import sequenceValuesToScalars
-from os import getcwd
-from waLBerla.tools.jobscripts import createJobscript
-from datetime import timedelta
-from copy import deepcopy
-import sys
-
-CSV_FILE = "overlap_benchmark.csv"
-
-BASE_CONFIG = {
-    'DomainSetup': {
-        'cellsPerBlock': (256, 128, 128),
-        'periodic': (1, 1, 1),
-    },
-    'Parameters': {
-        'omega': 1.8,
-        'timesteps': 400,
-        'cudaEnabledMPI': False,
-        'warmupSteps': 5,
-        'outerIterations': 3,
-        'initShearFlow': True,
-    }
-}
-
-
-class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
-        self.config_dict = deepcopy(BASE_CONFIG)
-        self.config_dict['Parameters'].update(kwargs)
-        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
-        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
-
-    @wlb.member_callback
-    def config(self, **kwargs):
-        from pprint import pformat
-        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
-        return self.config_dict
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
-        data = {}
-        data.update(self.config_dict['Parameters'])
-        data.update(self.config_dict['DomainSetup'])
-        data.update(kwargs)
-        data['executable'] = sys.argv[0]
-        data['compile_flags'] = wlb.build_info.compiler_flags
-        data['walberla_version'] = wlb.build_info.version
-        data['build_machine'] = wlb.build_info.build_machine
-        sequenceValuesToScalars(data)
-
-        df = pd.DataFrame.from_records([data])
-        if not os.path.isfile(CSV_FILE):
-            df.to_csv(CSV_FILE, index=False)
-        else:
-            df.to_csv(CSV_FILE, index=False, mode='a', header=False)
-
-
-def overlap_benchmark():
-    scenarios = wlb.ScenarioManager()
-    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
-                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
-                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
-
-    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:  # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
-        # no overlap
-        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1)))
-
-        # overlap
-        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
-            for inner_outer_split in inner_outer_splits:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    communicationScheme=comm_strategy,
-                                    innerOuterSplit=inner_outer_split)
-                scenarios.add(scenario)
-
-
-def single_gpu_benchmark():
-    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
-                   (32, 8, 1), (64, 8, 1),
-                   (32, 16, 1)]
-    for block_size in block_sizes:
-        for cuda_block_size in cuda_blocks:
-            cells = block_size[0] * block_size[1] * block_size[2]
-            time_steps_for_128_cubed = 1000
-            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
-            scenario = Scenario(cells_per_block=block_size,
-                                gpuBlockSize=cuda_block_size,
-                                timeStepStrategy='kernelOnly',
-                                timesteps=int(time_steps))
-            scenarios.add(scenario)
-
-
-all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
-                   'UniformGridBenchmarkGPU_AA_mrt',
-                   'UniformGridBenchmarkGPU_AA_smagorinsky',
-                   'UniformGridBenchmarkGPU_AA_srt',
-                   'UniformGridBenchmarkGPU_AA_trt',
-                   'UniformGridBenchmarkGPU_entropic',
-                   'UniformGridBenchmarkGPU_mrt',
-                   'UniformGridBenchmarkGPU_smagorinsky',
-                   'UniformGridBenchmarkGPU_srt',
-                   'UniformGridBenchmarkGPU_trt')
-
-
-def generate_jobscripts(machine='pizdaint_hybrid',
-                        exe_names=all_executables):
-    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
-        with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
-
-            js = createJobscript(nodes=node_count,
-                                 output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
-                                 error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
-                                 initial_dir=getcwd(),
-                                 commands=list(("./" + exe, 'overlap_benchmark.py') for exe in exe_names),
-                                 wall_time=timedelta(minutes=25),
-                                 machine=machine,
-                                 account='d105',
-                                 )
-            f.write(js)
-
-
-if __name__ == '__main__':
-    print("Called without waLBerla - generating job scripts for PizDaint")
-    generate_jobscripts()
-else:
-    single_gpu_benchmark()
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
deleted file mode 100755
index bfd59cd4c343015b4dd2ffe931f2167e7129665a..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python3
-from os import getcwd
-from waLBerla.tools.jobscripts import createJobscript
-from datetime import timedelta
-
-
-for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
-    with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f:
-        js = createJobscript(nodes=node_count,
-                             output_file='out_lbm_bench_{:04d}_%j.txt'.format(node_count),
-                             error_file='err_lbm_bench_{:04d}_%j.txt'.format(node_count),
-                             initial_dir=getcwd(),
-                             exe_name='UniformGridBenchmarkGPU',
-                             parameter_files=['weak_scaling.py'],
-                             wall_time=timedelta(minutes=25),
-                             machine='pizdaint_hybrid',
-                             account='d105',
-                             )
-        f.write(js)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
new file mode 100644
index 0000000000000000000000000000000000000000..ef257af8bc013da08ad5d04765e786c44a756a5e
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
@@ -0,0 +1,40 @@
+DomainSetup
+{
+    blocks < 1,1,1 >;              // use as many blocks as MPI processes
+    cellsPerBlock < 128,64,64 >;   // domain size per MPI process, leave constant for weak scaling
+    periodic < 1,1,1 >;
+}
+
+
+Parameters
+{
+    cudaEnabledMPI False;           // set to true if MPI was compiled with CUDA support
+    outerIterations 3;              // number of measurements to make
+    timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap; the non-AA version also supports complexOverlap
+                                    // simpleOverlap is usually the fastest
+    innerOuterSplit <8, 1, 1>;      // only relevant when overlapping communication
+                                    // the domain is split into a communication-dependent outer part and an inner part
+                                    // this parameter specifies the thickness of the outer layer in each direction
+                                    // making the outer part thicker than a single layer can help, since processing one thin outer layer is slow
+                                    // make sure the block is large enough: the outer part is 2*innerOuterSplit thick, so an inner part must remain
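+                                    // example: with cellsPerBlock < 128,64,64 > and innerOuterSplit < 8,1,1 >,
+                                    // the outer part is 2*8 = 16 cells thick in x, leaving an inner part of
+                                    // 128 - 16 = 112 cells in x that is computed while the ghost layers are exchanged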
+    timesteps 2000;                 // time steps per measurement
+    warmupSteps 5;                  // number of time steps before starting measurement
+
+    vtkWriteFrequency 0;            // how often to write VTK output
+
+    gpuBlockSize < 128,1,1 >;       // size of CUDA blocks - usually large x extents are fast
+    omega 1.8;
+
+    // only available in the non-AA version - determines how the ghost layer exchange is done
+    // the AA version always uses the fastest scheme, "UniformGPUScheme_Baseline"
+    //communicationScheme UniformGPUScheme_Baseline
+    //   UniformGPUScheme_Baseline:  packing/unpacking in generated kernels, every direction is handled by a separate CUDA stream, can make use of CUDA-aware MPI, most probably the fastest version
+    //   UniformGPUScheme_Memcpy:    same as above, but packing is done with cudaMemcpy(3D)
+    //   MPIDatatypes:               use MPI datatypes for packing, requires CUDA-aware MPI
+    //   MPIDatatypesFull:           same as above but sends all PDFs
+    //   GPUPackInfo_Baseline:       older implementation based on the CPU communication mechanism
+    //   GPUPackInfo_Streams:        same as above but with CUDA streams
+}
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
deleted file mode 100644
index 78da581df4c61cb84a2e94fae4a9b4ed5505d393..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import cells_per_block, num_processes
-from benchmark import BenchmarkScenario
-
-
-scenarios = wlb.ScenarioManager()
-
-testcase_name = "single-node"
-
-assert num_processes == 1
-
-for num_cells_per_block in cells_per_block:
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name)
-    scenario.scenario_config['DomainSetup']['cellsPerBlock'] = 3 * (num_cells_per_block,)
-    # Add scenario for execution
-    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
deleted file mode 100644
index 7f038c837b81d836fba828685e15d634bad3f5dc..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-                 cuda_enabled_mpi, num_processes
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xyz', 'yzx', 'yxz', 'zyx']
-
-cells_per_block = [256,]
-
-if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
-                                           block_decompositions, cells_per_block)
-else:
-    scenario_generator = itertools.product(communication_schemes, overlap_communication, 
-                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
-
-testcase_name = "strong-scaling"
-
-for scenario_params in scenario_generator:
-    # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
-    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
-        # Skip CUDA enabled MPI tests for GPUPackInfo tests
-        continue
-    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
-        # Skip communication overlap tests for GPUPackInfo baseline
-        continue
-
-    # Convert the axes decompositions to string
-    decomposition_axes_str = ''.join(decomposition_axes)
-    # Compute block decomposition based on the specified axes and the number of processes
-    blocks = get_block_decomposition(decomposition_axes, num_processes)
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
-    # Domain Setup parameters
-    domain_setup = scenario.scenario_config['DomainSetup']
-    domain_setup['cellsPerBlock'] = tuple(num_cells_per_block // block for block in blocks)
-    domain_setup['nrOfProcesses'] = blocks
-    domain_setup['blocks'] = blocks
-    # Additional parameters for benchmarking
-    params = scenario.scenario_config['Parameters']
-    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
-    params['overlapCommunication'] = is_communication_overlapped
-    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
-    # Add scenario for execution
-    scenarios.add(scenario)
-
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
deleted file mode 100644
index 66f9b2590625e7401c6cf830d35286c7bda23790..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-                 cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']
-
-# compute number of cells depending on GPU memory i.e. by specifying the percentage of GPU memory to fill
-gpu_memory_gb = 16
-cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]
-
-expected_mlups = 200  # to compute how many time steps have to be done
-time_per_scenarios = 5  # benchmark time in seconds
-
-fully_periodic = [False, True]
-
-if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False, ], [False, ],
-                                           block_decompositions, cells_per_block, fully_periodic)
-else:
-    scenario_generator = itertools.product(communication_schemes, [True],
-                                           cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic)
-
-testcase_name = "weak-scaling"
-
-for scenario_params in scenario_generator:
-    # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, fully_periodic = scenario_params
-    if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
-        # Skip CUDA enabled MPI tests for GPUPackInfo tests
-        continue
-    elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
-        # Skip communication overlap tests for GPUPackInfo without streams 
-        continue
-
-    # Convert the axes decompositions to string
-    decomposition_axes_str = ''.join(decomposition_axes)
-    # Compute block decomposition based on the specified axes and the number of processes
-    blocks = get_block_decomposition(decomposition_axes, num_processes)
-    # Estimate number of time steps
-    time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,)))
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str,
-                                 time_steps=time_steps, fully_periodic=fully_periodic)
-    # Domain Setup parameters
-    domain_setup = scenario.scenario_config['DomainSetup']
-    domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
-    domain_setup['nrOfProcesses'] = blocks
-    domain_setup['blocks'] = blocks
-    # Additional parameters for benchmarking
-    params = scenario.scenario_config['Parameters']
-    params['cudaEnabledMPI'] = is_cuda_enabled_mpi
-    params['overlapCommunication'] = is_communication_overlapped
-    params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
-    # Add scenario for execution
-    scenarios.add(scenario)