diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index d749568caed53a30379bafeb8c65fd742b45bbc3..f6dc064cfa6a2dc138a9bf4fee5523c7e8a969a0 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -176,18 +176,14 @@ int main( int argc, char **argv )
       auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
       timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );

-      /*
-      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
-      timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" );
-
-      timeLoop.run();
-      */

       auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
       lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
       timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );

+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
       timeLoop.run();
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");

       std::map< std::string, int > integerProperties;
       std::map< std::string, double > realProperties;
@@ -204,9 +200,6 @@ int main( int argc, char **argv )
          pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
          pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );

-         pythonCallbackResults.data().exposeValue( "mflups_total", realProperties["MFLUPS"] );
-         pythonCallbackResults.data().exposeValue( "mflups_process", realProperties["MFLUPS_process"] );
-
          // Call Python function to report results
          pythonCallbackResults();
       }
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
index c72988a8bc772426e794b2e5660fa34fd8d17126..7b98938f2b226dd834dfbc8a46875e940d003bd3 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
@@ -1,9 +1,10 @@
 # encoding: utf-8
 import math
+import operator
+from functools import reduce

 import waLBerla as wlb

-
 # Constants that define the size of blocks that are used in the benchmarks
 MIN_CELLS_PER_BLOCK = 16
 MAX_CELLS_PER_BLOCK = 256
@@ -13,7 +14,7 @@ cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, I
 # Blocks with size in [16, 32, 64, 128, 256]
 cells_per_block = [num_cells for num_cells in cells_per_block_interval]
 # Number of active MPI processes
-num_processes = wlb.mpi.numProcesses()
+num_processes = wlb.mpi.numProcesses()
 # Whether to overlap computation with communication
 overlap_communication = [False, True]
 # Whether MPI supports buffers in GPU memory
@@ -22,6 +23,18 @@ cuda_enabled_mpi = [False, True]
 communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']


+def calculate_time_steps(runtime, expected_mlups, domain_size):
+    cells = reduce(operator.mul, domain_size, 1)
+    time_steps_per_second = expected_mlups * 1e6 / cells
+    return int(time_steps_per_second * runtime)
+
+
+def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
+    bytes_per_cell = 19 * 2 * 8  # D3Q19: 19 PDFs, two fields, 8 bytes per value
+    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
+    return int(max_cells**(1 / 3))
+
+
 def get_block_decomposition(block_decomposition, num_processes):
     bx = by = bz = 1
     blocks_per_axis = int(math.log(num_processes, 2))
@@ -35,5 +48,4 @@ def get_block_decomposition(block_decomposition, num_processes):
             bx *= 2

     assert (bx * by * bz) == num_processes
-
-    return (bx, by, bz)
+    return bx, by, bz
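A quick sanity check of the two helpers added to `base.py` above, not part of the patch. The 304 bytes/cell figure (19 PDFs x 2 fields x 8 bytes) is taken from the code itself; the 16 GB GPU and 200 MLUPS inputs mirror the values used later in `weak_scaling.py`, and the printed numbers are only an illustration:

```python
import operator
from functools import reduce


def calculate_time_steps(runtime, expected_mlups, domain_size):
    cells = reduce(operator.mul, domain_size, 1)
    time_steps_per_second = expected_mlups * 1e6 / cells
    return int(time_steps_per_second * runtime)


def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
    bytes_per_cell = 19 * 2 * 8  # D3Q19, two PDF fields, double precision
    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
    return int(max_cells**(1 / 3))


side = side_length_to_fill_memory(0.8, 16)        # fill 80 % of a 16 GB GPU
print(side)                                       # -> 347, i.e. 347^3 cells ~ 12.7 GB
print(calculate_time_steps(3, 200, 3 * (side,)))  # -> 14 time steps in ~3 s at 200 MLUPS
```

Note the `int(...)` on the return value of `calculate_time_steps`: the result is written into the `timesteps` config parameter, which the C++ side reads as an unsigned integer, so a float must not leak through here.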
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
index 76c8a8967711f7e902acda23b4ad9bf56f587f31..daf4fee66347cbe31015aaf1420318689653c8bb 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
@@ -23,7 +23,7 @@ CommunicationSchemeName = {

 # Base configuration for the benchmark
 BASE_CONFIG = {
-    'DomainSetup' : {
+    'DomainSetup': {
         'cellsPerBlock': (64, 64, 64),
         'blocks': (1, 1, 1),
         'nrOfProcesses': (1, 1, 1),
@@ -53,44 +53,52 @@ BASE_CONFIG = {


 class BenchmarkScenario:
-    def __init__(self, testcase, decomposition_axes=None):
+    def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
         self.testcase = testcase
         self.scenario_config = copy.deepcopy(BASE_CONFIG)
+        self.scenario_config['Parameters']['timesteps'] = time_steps
+        self.fully_periodic = fully_periodic
+        if fully_periodic:
+            del self.scenario_config['Boundaries']['Border']
+            self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
         self.decomposition_axes = decomposition_axes

         now = datetime.now().replace(second=0, microsecond=0)
         self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'

-    @wlb.member_callback
-    def config(self, **kwargs):
-        return self.scenario_config
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
+    def get_data(self):
         block_setup = self.scenario_config.get('DomainSetup')
         params = self.scenario_config.get('Parameters')

-        data = [{
+        return {
             'processesX': block_setup.get('nrOfProcesses')[0],
             'processesY': block_setup.get('nrOfProcesses')[1],
             'processesZ': block_setup.get('nrOfProcesses')[2],
             'blocksX': block_setup.get('blocks')[0],
             'blocksY': block_setup.get('blocks')[1],
             'blocksZ': block_setup.get('blocks')[2],
+            'fully_periodic': self.fully_periodic,
             'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
             'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
             'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
             'cudaEnabledMPI': params.get('cudaEnabledMPI'),
             'overlapCommunication': params.get('overlapCommunication'),
+            'time_steps': params['timesteps'],
             'domainDecomposition': self.decomposition_axes,
             'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
-            'mlupsTotal': kwargs.get('mlups_total'),
-            'mlupsProcess': kwargs.get('mlups_process'),
-            'mflupsTotal': kwargs.get('mflups_total'),
-            'mflupsProcess': kwargs.get('mflups_process'),
-        }]
+        }

-        self.save_data(data)
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
+        return self.scenario_config
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = self.get_data()
+        data.update(kwargs)
+        self.save_data([data])

     def save_data(self, data):
         df = pd.DataFrame(data)
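With the refactoring above, `results_callback` builds each CSV row by merging the static scenario description from `get_data()` with whatever result values the C++ side exposes (here `mlups_total` and `mlups_process` from `UniformGridGPU.cpp`). A minimal sketch of that merge, with made-up numbers and assuming a Python environment where the waLBerla module is importable (it is needed for the `wlb.member_callback` decorator in `benchmark.py`):

```python
from benchmark import BenchmarkScenario

scenario = BenchmarkScenario(testcase='demo', time_steps=100,
                             decomposition_axes='xyz', fully_periodic=True)
row = scenario.get_data()             # static setup columns for the CSV
row.update({'mlups_total': 950.0,     # values normally passed in as kwargs
            'mlups_process': 950.0})  # (made-up numbers for illustration)
scenario.save_data([row])             # one CSV row, same as results_callback
```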
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c14d5b9b562bb53143b22bb660d82fecf66e500
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
@@ -0,0 +1,17 @@
+from os import getcwd
+from waLBerla.tools.jobscripts import createJobscript
+from datetime import timedelta
+
+
+for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+    with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f:
+        js = createJobscript(nodes=node_count,
+                             output_file='out_lbm_bench_%j.txt',
+                             error_file='err_lbm_bench_%j.txt',
+                             initial_dir=getcwd(),
+                             exe_name='UniformGridBenchmarkGPU',
+                             parameter_files=['weak_scaling.py'],
+                             wall_time=timedelta(minutes=25),
+                             machine='pizdaint_hybrid'
+                             )
+        f.write(js)
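The generator above writes one zero-padded job script per node count; how these are submitted (e.g. via `sbatch` on the Piz Daint login node, since `pizdaint_hybrid` is a SLURM machine) is an assumption and not part of this patch:

```python
# Filenames produced by pizdaint_jobfiles.py:
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
    print("job_weak_scaling_{:04d}.sh".format(node_count))
# -> job_weak_scaling_0001.sh, job_weak_scaling_0002.sh, ..., job_weak_scaling_2400.sh
```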
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
index c29d935dca8306856ef4646c0565f636bb0fe7fb..8076aeef7405f6bb5101d7a61be488c0babd3932 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
@@ -3,7 +3,7 @@ import itertools
 import waLBerla as wlb

 from base import get_block_decomposition, communication_schemes, overlap_communication, \
-    cuda_enabled_mpi, num_processes
+    cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory
 from benchmark import BenchmarkScenario, CommunicationSchemeType


@@ -14,20 +14,27 @@ scenarios = wlb.ScenarioManager()

 #block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
 block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']

-cells_per_block = [64, 128, 240, 256]
+# compute the number of cells from the available GPU memory, i.e. by specifying the percentage of memory to fill
+gpu_memory_gb = 16
+cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]
+
+expected_mlups = 200  # used to estimate how many time steps have to be done
+time_per_scenarios = 3  # benchmark time in seconds
+
+fully_periodic = [False, True]

 if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
-                                           block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [False, ], [False, ],
+                                           block_decompositions, cells_per_block, fully_periodic)
 else:
-    scenario_generator = itertools.product(communication_schemes, overlap_communication,
-                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [True],
+                                           cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic)

 testcase_name = "weak-scaling"
 for scenario_params in scenario_generator:
     # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, is_fully_periodic = scenario_params
     if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
         # Skip CUDA enabled MPI tests for GPUPackInfo tests
         continue
@@ -39,8 +46,11 @@ for scenario_params in scenario_generator:
     decomposition_axes_str = ''.join(decomposition_axes)
     # Compute block decomposition based on the specified axes and the number of processes
     blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Estimate the number of time steps so that each scenario runs for roughly time_per_scenarios seconds
+    time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,)))
     # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str,
+                                 time_steps=time_steps, fully_periodic=is_fully_periodic)
     # Domain Setup parameters
     domain_setup = scenario.scenario_config['DomainSetup']
     domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
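For the multi-process branch, the generator above enumerates 3 communication schemes x 1 overlap setting (pinned to `[True]`) x 2 MPI modes x 4 decompositions x 3 block sizes x 2 periodicity settings = 144 tuples, of which the CUDA-aware-MPI combinations of the two non-baseline schemes are skipped. A quick standalone check of that count (the cells-per-block values are the ones a 16 GB GPU would yield, for illustration only):

```python
import itertools

communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
gen = itertools.product(communication_schemes, [True], [False, True],
                        ['xyz', 'yzx', 'zyx', 'yxz'], [347, 297, 138], [False, True])
# Apply the same skip rule as the loop in weak_scaling.py
kept = [p for p in gen
        if not (p[0] != 'UniformGPUScheme_Baseline' and p[2] is True)]
print(len(kept))  # -> 96 scenarios actually run
```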