7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -26,7 +26,6 @@ Parameters
 	omega 1.92;
 	initShearFlow 1;
-	useGui 0;
 }
 /*

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
 import sympy as sp
 import numpy as np
 import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
+from lbmpy.enums import Method, Stencil, SubgridScaleModel
-from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
+from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
 from pystencils_walberla import generate_pack_info_from_kernel
+from lbmpy_walberla import generate_lattice_model, generate_boundary
 from pystencils_walberla import CodeGeneration, generate_sweep
-from pystencils.data_types import TypedSymbol
+from pystencils.typing import TypedSymbol
 from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 omega = sp.symbols("omega")
 omega_free = sp.Symbol("omega_free")
+omega_fill = sp.symbols("omega_:10")
 compile_time_block_size = False
 if compile_time_block_size:
@@ -37,24 +41,42 @@ options_dict = {
    'mrt': {
        'method': 'mrt',
        'stencil': 'D3Q19',
-        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
+        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
+    },
+    'mrt_full': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
+                             omega_fill[3], omega_fill[4], omega_fill[5]],
    },
    'entropic': {
        'method': 'mrt',
        'stencil': 'D3Q19',
        'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
+        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
+        'entropic': True,
+    },
+    'entropic_kbc_n4': {
+        'method': 'trt-kbc-n4',
+        'stencil': 'D3Q27',
+        'compressible': True,
+        'relaxation_rates': [omega, omega_free],
        'entropic': True,
    },
    'smagorinsky': {
        'method': 'srt',
        'stencil': 'D3Q19',
-        'smagorinsky': True,
+        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
        'relaxation_rate': omega,
-    }
+    },
+    'cumulant': {
+        'method': 'cumulant',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rate': omega,
+    },
 }
 info_header = """
 #include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
 const char * infoStencil = "{stencil}";
@@ -63,22 +85,37 @@ const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
 with CodeGeneration() as ctx:
-    accessors = {
+    accessor = StreamPullTwoFieldsAccessor()
-        'Even': AAEvenTimeStepAccessor(),
+    # accessor = StreamPushTwoFieldsAccessor()
-        'Odd': AAOddTimeStepAccessor()
+    assert not accessor.is_inplace, "This app does not work for inplace accessors"
-    }
    common_options = {
        'field_name': 'pdfs',
+        'temporary_field_name': 'pdfs_tmp',
+        'kernel_type': accessor,
        'optimization': {'cse_global': True,
-                         'cse_pdfs': False,
+                         'cse_pdfs': False}
-                         'field_layout': 'fzyx',
-                         }
    }
-    options = options_dict.get(ctx.config, options_dict['srt'])
+    config_name = ctx.config
+    noopt = False
+    d3q27 = False
+    if config_name.endswith("_noopt"):
+        noopt = True
+        config_name = config_name[:-len("_noopt")]
+    if config_name.endswith("_d3q27"):
+        d3q27 = True
+        config_name = config_name[:-len("_d3q27")]
+    options = options_dict[config_name]
    options.update(common_options)
+    options = options.copy()
+    if noopt:
+        options['optimization']['cse_global'] = False
+        options['optimization']['cse_pdfs'] = False
+    if d3q27:
+        options['stencil'] = 'D3Q27'
    stencil_str = options['stencil']
    q = int(stencil_str[stencil_str.find('Q') + 1:])
@@ -86,37 +123,48 @@ with CodeGeneration() as ctx:
    options['optimization']['symbolic_field'] = pdfs
    vp = [
+        ('double', 'omega_0'),
+        ('double', 'omega_1'),
+        ('double', 'omega_2'),
+        ('double', 'omega_3'),
+        ('double', 'omega_4'),
+        ('double', 'omega_5'),
+        ('double', 'omega_6'),
        ('int32_t', 'cudaBlockSize0'),
-        ('int32_t', 'cudaBlockSize1')
+        ('int32_t', 'cudaBlockSize1'),
    ]
    lb_method = create_lb_method(**options)
+    update_rule = create_lb_update_rule(lb_method=lb_method, **options)
-    # Kernels
+    if not noopt:
-    options_without_opt = options.copy()
-    del options_without_opt['optimization']
-    update_rules = {}
-    for name, accessor in accessors.items():
-        update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options)
        update_rule = insert_fast_divisions(update_rule)
        update_rule = insert_fast_sqrts(update_rule)
-        update_rules[name] = update_rule
-        generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule,
+    # CPU lattice model - required for macroscopic value computation, VTK output etc.
-                       inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
+    options_without_opt = options.copy()
-                       varying_parameters=vp)
+    del options_without_opt['optimization']
+    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method,
+                                                                                        **options_without_opt))
+    # gpu LB sweep & boundaries
+    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
+                   field_swaps=[('pdfs', 'pdfs_tmp')],
+                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
+                   varying_parameters=vp)
+    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
+    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
    # getter & setter
    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
                                                   pdfs=pdfs.center_vector, density=1.0)
    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments)
+    generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
    # communication
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'],
+    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
-                                   kind='pull', target='gpu')
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'],
-                                   kind='push', target='gpu')
    infoHeaderParams = {
        'stencil': stencil_str,
@@ -125,4 +173,4 @@ with CodeGeneration() as ctx:
        'cse_global': int(options['optimization']['cse_global']),
        'cse_pdfs': int(options['optimization']['cse_pdfs']),
    }
-    ctx.write_file("UniformGridGPU_AA_Defines.h", info_header.format(**infoHeaderParams))
+    ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
@@ -7,9 +7,9 @@
 #include "blockforest/communication/UniformDirectScheme.h"
 #include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
 #include "field/communication/UniformMPIDatatypeInfo.h"
-#include "cuda/communication/GPUPackInfo.h"
+#include "gpu/communication/GPUPackInfo.h"
-#include "cuda/communication/UniformGPUScheme.h"
+#include "gpu/communication/UniformGPUScheme.h"
-#include "cuda/communication/MemcpyPackInfo.h"
+#include "gpu/communication/MemcpyPackInfo.h"
 #include "UniformGridGPU_PackInfo.h"
@@ -36,28 +36,28 @@ public:
          _gpuCommunicationScheme(nullptr), _directScheme(nullptr)
    {
        auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
-        auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
+        auto memcpyPackInfo = make_shared< gpu::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
        auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
        auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId );
        switch(_commSchemeType)
        {
            case GPUPackInfo_Baseline:
-                _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId );
+                _gpuPackInfo = make_shared< gpu::communication::GPUPackInfo< GPUFieldType > >( bdId );
                _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf );
                _cpuCommunicationScheme->addPackInfo( _gpuPackInfo );
                break;
            case GPUPackInfo_Streams:
-                _gpuPackInfo = make_shared< cuda::communication::GPUPackInfo< GPUFieldType > >( bdId );
+                _gpuPackInfo = make_shared< gpu::communication::GPUPackInfo< GPUFieldType > >( bdId );
                _cpuCommunicationScheme = make_shared< blockforest::communication::UniformBufferedScheme< StencilType > >( bf );
                _cpuCommunicationScheme->addPackInfo( _gpuPackInfo );
                break;
            case UniformGPUScheme_Baseline:
-                _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
+                _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
                _gpuCommunicationScheme->addPackInfo( generatedPackInfo );
                break;
            case UniformGPUScheme_Memcpy:
-                _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
+                _gpuCommunicationScheme = make_shared< gpu::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
                _gpuCommunicationScheme->addPackInfo( memcpyPackInfo );
                break;
            case MPIDatatypes:
@@ -151,7 +151,7 @@ public:
 private:
    CommunicationSchemeType _commSchemeType;
    shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme;
-    shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo;
+    shared_ptr< gpu::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo;
-    shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme;
+    shared_ptr< gpu::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme;
    shared_ptr< blockforest::communication::UniformDirectScheme<StencilType> > _directScheme;
 };
--- a/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
+#!/usr/bin/env python3
+"""
+This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
+Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
+ ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py
+Look at the end of the file to select the benchmark to run
+"""
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+from copy import deepcopy
+from functools import reduce
+import operator
+import sys
+import sqlite3
+# Number of time steps run for a workload of 128^3 per GPU
+# if double as many cells are on the GPU, half as many time steps are run etc.
+# increase this to get more reliable measurements
+TIME_STEPS_FOR_128_BLOCK = 200
+DB_FILE = "gpu_benchmark.sqlite3"
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+def prod(seq):
+    return reduce(operator.mul, seq, 1)
+def num_time_steps(block_size, time_steps_for_128_block=200):
+    cells = block_size[0] * block_size[1] * block_size[2]
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    return int(time_steps)
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Checks if a given CUDA block size does not exceed the SM register limit.
+    168 registers per thread was obtained using cuobjdump on both SRT and Cumulant
+    kernels. You might want to validate that for your own kernels."""
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), db_file=DB_FILE, **kwargs):
+        self.db_file = db_file
+        self.config_dict = deepcopy(BASE_CONFIG)
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+    @wlb.member_callback
+    def config(self):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        # Write out the configuration as text-based prm:
+        # print(toPrm(self.config_dict))
+        return self.config_dict
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, "runs", self.db_file)
+                storeSingle(result, "runs", self.db_file)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+def overlap_benchmark():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running different communication overlap strategies")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
+    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
+                               innerOuterSplit=(1, 1, 1)))
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split,
+                                    timesteps=num_time_steps((256, 128, 128)))
+                scenarios.add(scenario)
+def communication_compare():
+    """Tests different communication strategies"""
+    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+            sc = Scenario(cells_per_block=block_size,
+                          gpuBlockSize=(128, 1, 1),
+                          timeStepStrategy='noOverlap',
+                          communicationScheme=comm_strategy,
+                          timesteps=num_time_steps(block_size))
+            scenarios.add(sc)
+            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
+                # ensure that the inner part of the domain is still large enough
+                if 3 * inner_outer_split[0] > block_size[0]:
+                    continue
+                sc = Scenario(cells_per_block=block_size,
+                              gpuBlockSize=(128, 1, 1),
+                              timeStepStrategy='simpleOverlap',
+                              innerOuterSplit=inner_outer_split,
+                              communicationScheme=comm_strategy,
+                              timesteps=num_time_steps(block_size))
+                scenarios.add(sc)
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+    kwargs = {}
+    if gpu_type is not None:
+        kwargs['gpu_type'] = gpu_type
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=num_time_steps(block_size),
+                                **kwargs)
+            scenarios.add(scenario)
+# -------------------------------------- Optional job script generation for PizDaint ---------------------------------
+job_script_header = """
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --time=0:30:00
+#SBATCH --nodes={nodes}
+#SBATCH -o out_scaling_{nodes}_%j.txt
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+cd {folder}
+source ~/env.sh
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1            # allow GPU sharing
+export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+ulimit -c 0
+"""
+job_script_exe_part = """
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+ ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant'
+                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+def generate_jobscripts(exe_names=all_executables):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+        # run grid_order
+        import subprocess
+        decomposition = block_decomposition(node_count)
+        decomposition_str = ",".join(str(e) for e in decomposition)
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
+        for exe in exe_names:
+            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
+                                                     config='../communication_compare.py')
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write(job_script)
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    # Select the benchmark you want to run
+    single_gpu_benchmark()
+    # benchmarks different CUDA block sizes and domain sizes and measures single
+    # GPU performance of compute kernel (no communication)
+    # communication_compare(): benchmarks different communication routines, with and without overlap
+    # overlap_benchmark(): benchmarks different communication overlap options
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
@@ -8,6 +8,7 @@ DomainSetup
 Parameters
 {
+    initShearFlow False;
    cudaEnabledMPI False;           // set to true, if MPI was compiled with CUDA
    outerIterations 3;              // number of measurements to make
    timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap, the non-AA version also supports complexOverlap
@@ -35,3 +36,10 @@ Parameters
    //   GPUPackInfo_Baseline:       old implementation based on communication mechanism for CPUs
    //   GPUPackInfo_Streams:        same as above but with CUDA streams
 }
+Boundaries {
+    Border { direction T; walldistance -1; flag UBB; }
+    Border { direction B; walldistance -1; flag NoSlip; }
+    Border { direction W; walldistance -1; flag NoSlip; }
+    Border { direction E; walldistance -1; flag NoSlip; }
+}
--- a/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods
+++ b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
-#!/usr/bin/env python3
-"""
-This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
-Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
- ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py
-Look at the end of the file to select the benchmark to run
-"""
 import os
 import waLBerla as wlb
 from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
-from copy import deepcopy
 import sys
 import sqlite3
+from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
-TIME_STEPS_FOR_128_BLOCK = 200
+TIME_STEPS_FOR_128_BLOCK = 1000
-DB_FILE = "gpu_benchmark.sqlite3"
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
 BASE_CONFIG = {
    'DomainSetup': {
@@ -34,27 +39,107 @@ BASE_CONFIG = {
    }
 }
+ldc_setup = {'Border': [
+    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+]}
-def num_time_steps(block_size):
+def num_time_steps(block_size, time_steps_for_128_block=200):
    cells = block_size[0] * block_size[1] * block_size[2]
-    time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    if time_steps < 10:
+        time_steps = 10
    return int(time_steps)
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Checks if a given CUDA block size does not exceed the SM register limit.
+    168 registers per thread was obtained using cuobjdump on both SRT and Cumulant
+    kernels. You might want to validate that for your own kernels."""
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
 class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1),
-        self.config_dict = deepcopy(BASE_CONFIG)
+                 timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
-        self.config_dict['Parameters'].update(kwargs)
+                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
-        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+                 init_shear_flow=False, boundary_setup=False,
-        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+                 vtk_write_frequency=0, remaining_time_logger_frequency=-1,
+                 additional_info=None, blocks=None, db_file_name=None):
+        if boundary_setup:
+            init_shear_flow = False
+            periodic = (0, 0, 0)
+        self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
+        self.cells_per_block = cells_per_block
+        self.periodic = periodic
+        self.time_step_strategy = time_step_strategy
+        self.omega = omega
+        self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.inner_outer_split = inner_outer_split
+        self.init_shear_flow = init_shear_flow
+        self.boundary_setup = boundary_setup
+        self.warmup_steps = warmup_steps
+        self.outer_iterations = outer_iterations
+        self.cuda_blocks = cuda_blocks
+        self.vtk_write_frequency = vtk_write_frequency
+        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+        self.config_dict = self.config(print_dict=False)
+        self.additional_info = additional_info
    @wlb.member_callback
-    def config(self, **kwargs):
+    def config(self, print_dict=True):
        from pprint import pformat
-        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        config_dict = {
-        # Write out the configuration as text-based prm:
+            'DomainSetup': {
-        # print(toPrm(self.config_dict))
+                'blocks': self.blocks,
-        return self.config_dict
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'omega': self.omega,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'warmupSteps': self.warmup_steps,
+                'outerIterations': self.outer_iterations,
+                'timeStepStrategy': self.time_step_strategy,
+                'timesteps': self.timesteps,
+                'initShearFlow': self.init_shear_flow,
+                'gpuBlockSize': self.cuda_blocks,
+                'innerOuterSplit': self.inner_outer_split,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
+            },
+            'Logging': {
+                'logLevel': 'info',  # info progress detail tracing
+            }
+        }
+        if self.boundary_setup:
+            config_dict["Boundaries"] = ldc_setup
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+            if self.additional_info:
+                wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info))
+        return config_dict
    @wlb.member_callback
    def results_callback(self, **kwargs):
@@ -62,23 +147,53 @@ class Scenario:
        data.update(self.config_dict['Parameters'])
        data.update(self.config_dict['DomainSetup'])
        data.update(kwargs)
+        if self.additional_info is not None:
+            data.update(self.additional_info)
        data['executable'] = sys.argv[0]
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
        sequenceValuesToScalars(data)
        result = data
        sequenceValuesToScalars(result)
        num_tries = 4
        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
+        table_name = table_name.replace("-", "_")  # - not allowed for table name would lead to syntax error
        for num_try in range(num_tries):
            try:
-                checkAndUpdateSchema(result, "runs", DB_FILE)
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
-                storeSingle(result, "runs", DB_FILE)
+                storeSingle(result, table_name, self.db_file_name)
                break
            except sqlite3.OperationalError as e:
-                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+# -------------------------------------- Profiling -----------------------------------
+def profiling():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running 2 timesteps for profiling")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    cells = (128, 128, 128)
+    cuda_enabled_mpi = False
+    scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
+                           inner_outer_split=(1, 1, 1), timesteps=2, cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1, warmup_steps=0))
 # -------------------------------------- Functions trying different parameter sets -----------------------------------
@@ -90,158 +205,153 @@ def overlap_benchmark():
    wlb.log_info_on_root("")
    scenarios = wlb.ScenarioManager()
+    cuda_enabled_mpi = False
    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
-    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+    # no overlap
-    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+    scenarios.add(Scenario(time_step_strategy='noOverlap',
-        # no overlap
+                           inner_outer_split=(1, 1, 1),
-        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
+                           cuda_enabled_mpi=cuda_enabled_mpi,
-                               innerOuterSplit=(1, 1, 1)))
+                           outer_iterations=1))
-        # overlap
-        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
-            for inner_outer_split in inner_outer_splits:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    communicationScheme=comm_strategy,
-                                    innerOuterSplit=inner_outer_split,
-                                    timesteps=num_time_steps((256, 128, 128)))
-                scenarios.add(scenario)
-def communication_compare():
-    """Tests different communication strategies"""
-    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
-    wlb.log_info_on_root("")
-    scenarios = wlb.ScenarioManager()
+    for inner_outer_split in inner_outer_splits:
-    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        scenario = Scenario(time_step_strategy='simpleOverlap',
-        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+                            inner_outer_split=inner_outer_split,
+                            cuda_enabled_mpi=cuda_enabled_mpi,
-            sc = Scenario(cells_per_block=block_size,
+                            outer_iterations=1)
-                          gpuBlockSize=(128, 1, 1),
+        scenarios.add(scenario)
-                          timeStepStrategy='noOverlap',
-                          communicationScheme=comm_strategy,
-                          timesteps=num_time_steps(block_size))
-            scenarios.add(sc)
-            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
-                # ensure that the inner part of the domain is still large enough
-                if 3 * inner_outer_split[0] > block_size[0]:
-                    continue
-                sc = Scenario(cells_per_block=block_size,
-                              gpuBlockSize=(128, 1, 1),
-                              timeStepStrategy='simpleOverlap',
-                              innerOuterSplit=inner_outer_split,
-                              communicationScheme=comm_strategy,
-                              timesteps=num_time_steps(block_size))
-                scenarios.add(sc)
-def single_gpu_benchmark():
+def no_overlap_scaling(cuda_enabled_mpi=False):
-    """Benchmarks only the LBM compute kernel"""
+    """Tests different communication overlapping strategies"""
-    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("Running scaling benchmark without communication hiding")
    wlb.log_info_on_root("")
    scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    # no overlap
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+    scenarios.add(Scenario(cells_per_block=(256, 256, 256),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                           cuda_blocks=(128, 1, 1),
-                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                           time_step_strategy='noOverlap',
-                   (32, 8, 1), (64, 8, 1),
+                           inner_outer_split=(1, 1, 1),
-                   (32, 16, 1)]
+                           cuda_enabled_mpi=cuda_enabled_mpi,
-    for block_size in block_sizes:
+                           outer_iterations=1))
-        for cuda_block_size in cuda_blocks:
-            scenario = Scenario(cells_per_block=block_size,
-                                gpuBlockSize=cuda_block_size,
-                                timeStepStrategy='kernelOnly',
-                                timesteps=num_time_steps(block_size))
-            scenarios.add(scenario)
-# -------------------------------------- Optional job script generation for PizDaint ---------------------------------
-job_script_header = """
-#!/bin/bash -l
-#SBATCH --job-name=scaling
-#SBATCH --time=0:30:00
-#SBATCH --nodes={nodes}
-#SBATCH -o out_scaling_{nodes}_%j.txt
-#SBATCH -e err_scaling_{nodes}_%j.txt
-#SBATCH --ntasks-per-core=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --cpus-per-task=1
-#SBATCH --partition=normal
-#SBATCH --constraint=gpu
-#SBATCH --account=d105
-cd {folder}
+def weak_scaling_overlap(cuda_enabled_mpi=False):
+    """Tests different communication overlapping strategies"""
-source ~/env.sh
+    wlb.log_info_on_root("Running scaling benchmark with communication hiding")
+    wlb.log_info_on_root("")
-module load daint-gpu
-module load craype-accel-nvidia60
-export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
-export CRAY_CUDA_MPS=1            # allow GPU sharing
-export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
-export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+    scenarios = wlb.ScenarioManager()
-export CRAY_CUDA_MPS=1
-export MPICH_RANK_REORDER_METHOD=3
+    # overlap
-export PMI_MMAP_SYNC_WAIT_TIME=300
+    for t in ["noOverlap", "simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(8, 8, 8),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               boundary_setup=True,
+                               db_file_name="weakScalingUniformGrid.sqlite3"))
-# grid_order -R -H -c 1,1,8 -g 16,16,8
+def strong_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
+    wlb.log_info_on_root("")
-ulimit -c 0
+    scenarios = wlb.ScenarioManager()
-"""
-job_script_exe_part = """
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
-export WALBERLA_SCENARIO_IDX=0
+    # overlap
-while srun -n {nodes} ./{app} {config}
+    for t in ["noOverlap", "simpleOverlap"]:
-do
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
- ((WALBERLA_SCENARIO_IDX++))
+                               cuda_blocks=(128, 1, 1),
-done
+                               time_step_strategy=t,
-"""
+                               inner_outer_split=(1, 1, 1),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               timesteps=50,
+                               blocks=blocks,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
-all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+def single_gpu_benchmark():
-                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+    """Benchmarks only the LBM compute kernel"""
-                   'UniformGridBenchmarkGPU_cumulant'
+    wlb.log_info_on_root("Running single GPU benchmarks")
-                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+    wlb.log_info_on_root("")
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
-def generate_jobscripts(exe_names=all_executables):
+    additional_info = {}
-    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+    if gpu_type is not None:
-        folder_name = "scaling_{:04d}".format(node_count)
+        additional_info['gpu_type'] = gpu_type
-        os.makedirs(folder_name, exist_ok=True)
-        # run grid_order
+    scenarios = wlb.ScenarioManager()
-        import subprocess
+    block_sizes = [(i, i, i) for i in (128, 256, 320)]
-        decomposition = block_decomposition(node_count)
+    cuda_blocks = [(128, 1, 1), ]
-        decomposition_str = ",".join(str(e) for e in decomposition)
+    for block_size in block_sizes:
-        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+        for cuda_block_size in cuda_blocks:
+            # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
+            if cuda_block_size > block_size:
+                continue
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                cuda_blocks=cuda_block_size,
+                                time_step_strategy='kernelOnly',
+                                timesteps=num_time_steps(block_size, 2000),
+                                outer_iterations=1,
+                                additional_info=additional_info)
+            scenarios.add(scenario)
-        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
-        for exe in exe_names:
-            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
-                                                     config='../communication_compare.py')
-        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+def validation_run():
-            f.write(job_script)
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")
+    wlb.log_info_on_root("")
+    time_step_strategy = "noOverlap"  # "simpleOverlap"
-if __name__ == '__main__':
+    scenarios = wlb.ScenarioManager()
-    print("Called without waLBerla - generating job scripts for PizDaint")
+    scenario = Scenario(cells_per_block=(128, 128, 128),
-    generate_jobscripts()
+                        time_step_strategy=time_step_strategy,
-else:
+                        timesteps=10001,
-    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+                        outer_iterations=1,
-    # Select the benchmark you want to run
+                        warmup_steps=0,
+                        init_shear_flow=False,
+                        boundary_setup=True,
+                        vtk_write_frequency=5000,
+                        remaining_time_logger_frequency=30)
+    scenarios.add(scenario)
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+# Select the benchmark you want to run
+# single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU
+# performance of compute kernel (no communication)
+# overlap_benchmark()  # benchmarks different communication overlap options
+# profiling()  # run only two timesteps on a smaller domain for profiling only
+# validation_run()
+if BENCHMARK == 0:
    single_gpu_benchmark()
-    # benchmarks different CUDA block sizes and domain sizes and measures single
+elif BENCHMARK == 1:
-    # GPU performance of compute kernel (no communication)
+    weak_scaling_overlap(True)
-    # communication_compare(): benchmarks different communication routines, with and without overlap
+elif BENCHMARK == 2:
-    # overlap_benchmark(): benchmarks different communication overlap options
+    strong_scaling_overlap(True)
+else:
+    validation_run()
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs_RDM.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs_RDM.py
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sys
+import sqlite3
+from math import prod
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+# Number of time steps run for a workload of 128^3 per GPU
+# if double as many cells are on the GPU, half as many time steps are run etc.
+# increase this to get more reliable measurements
+TIME_STEPS_FOR_128_BLOCK = int(os.environ.get('TIME_STEPS_FOR_128_BLOCK', 1000))
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+ldc_setup = {'Border': [
+    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+]}
+def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
+    """
+    Calculate the number of time steps based on the block size.
+    This function computes the number of time steps required for a given block size
+    by scaling the time steps that could be executed on one process within one second 
+    for a 128x128x128 cells_per_block to the given cells_per_block size.
+    Parameters:
+    block_size (tuple): A tuple of three integers representing the dimensions of the cells_per_block (x, y, z).
+    time_steps_for_128_block (int, optional): The number of time steps for a 128x128x128 block. Default is 1000, 
+    which is approximately the number of timesteps that were executed within one second on one entire MareNostrum5-acc node.
+    Returns:
+    int: The calculated number of time steps, with a minimum value of 10.
+    """
+    cells = block_size[0] * block_size[1] * block_size[2]
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    if time_steps < 10:
+        time_steps = 10
+    return int(time_steps)
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """
+    Checks if a given CUDA block size does not exceed the SM register limit.
+    168 registers per thread was obtained using cuobjdump on both SRT and Cumulant
+    kernels. You might want to validate that for your own kernels.
+    """
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """
+    Checks if a single block of given size fits into GPU memory.
+    """
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1),
+                 timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
+                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
+                 init_shear_flow=False, boundary_setup=False,
+                 vtk_write_frequency=0, remaining_time_logger_frequency=-1,
+                 additional_info=None, blocks=None, db_file_name=None):
+        if boundary_setup:
+            init_shear_flow = False
+            periodic = (0, 0, 0)
+        self.blocks = blocks if blocks else block_decomposition(wlb.mpi.numProcesses())
+        self.cells_per_block = cells_per_block
+        self.periodic = periodic
+        self.time_step_strategy = time_step_strategy
+        self.omega = omega
+        self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.inner_outer_split = inner_outer_split
+        self.init_shear_flow = init_shear_flow
+        self.boundary_setup = boundary_setup
+        self.warmup_steps = warmup_steps
+        self.outer_iterations = outer_iterations
+        self.cuda_blocks = cuda_blocks
+        self.vtk_write_frequency = vtk_write_frequency
+        self.remaining_time_logger_frequency = remaining_time_logger_frequency
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+        self.config_dict = self.config(print_dict=False)
+        self.additional_info = additional_info
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'omega': self.omega,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'warmupSteps': self.warmup_steps,
+                'outerIterations': self.outer_iterations,
+                'timeStepStrategy': self.time_step_strategy,
+                'timesteps': self.timesteps,
+                'initShearFlow': self.init_shear_flow,
+                'gpuBlockSize': self.cuda_blocks,
+                'innerOuterSplit': self.inner_outer_split,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
+            },
+            'Logging': {
+                'logLevel': 'info',  # info progress detail tracing
+            }
+        }
+        if self.boundary_setup:
+            config_dict["Boundaries"] = ldc_setup
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+            if self.additional_info:
+                wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info))
+        return config_dict
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        if self.additional_info is not None:
+            data.update(self.additional_info)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()  # generate subclasses
+            state.update()  # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+        sequenceValuesToScalars(data)
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
+        table_name = table_name.replace("-", "_")  # - not allowed for table name would lead to syntax error
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+def weak_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running scaling benchmark with communication hiding")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    # overlap
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=(WeakX, WeakY, WeakZ),
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(8, 8, 8),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               boundary_setup=True,
+                               db_file_name="weakScalingUniformGrid.sqlite3"))
+def strong_scaling_overlap(cuda_enabled_mpi=False):
+    wlb.log_info_on_root("Running strong scaling benchmark with one block per proc with communication hiding")
+    wlb.log_info_on_root("")
+    scenarios = wlb.ScenarioManager()
+    domain_size = (StrongX, StrongY, StrongZ)
+    blocks = block_decomposition(wlb.mpi.numProcesses())
+    cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
+    # overlap
+    for t in ["simpleOverlap"]:
+        scenarios.add(Scenario(cells_per_block=cells_per_block,
+                               cuda_blocks=(128, 1, 1),
+                               time_step_strategy=t,
+                               inner_outer_split=(1, 1, 1),
+                               cuda_enabled_mpi=cuda_enabled_mpi,
+                               outer_iterations=1,
+                               timesteps=num_time_steps(cells_per_block),
+                               blocks=blocks,
+                               boundary_setup=True,
+                               db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+    additional_info = {}
+    if gpu_type is not None:
+        additional_info['gpu_type'] = gpu_type
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (128, 256, 320)]
+    cuda_blocks = [(128, 1, 1), ]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            # cuda_block_size = (256, 1, 1) and block_size = (64, 64, 64) would be cut to cuda_block_size = (64, 1, 1)
+            if cuda_block_size > block_size:
+                continue
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                cuda_blocks=cuda_block_size,
+                                time_step_strategy='kernelOnly',
+                                timesteps=num_time_steps(block_size, 2000),
+                                outer_iterations=1,
+                                additional_info=additional_info)
+            scenarios.add(scenario)
+def validation_run():
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")
+    wlb.log_info_on_root("")
+    time_step_strategy = "noOverlap"  # "simpleOverlap"
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(cells_per_block=(128, 128, 128),
+                        time_step_strategy=time_step_strategy,
+                        timesteps=10001,
+                        outer_iterations=1,
+                        warmup_steps=0,
+                        init_shear_flow=False,
+                        boundary_setup=True,
+                        vtk_write_frequency=5000,
+                        remaining_time_logger_frequency=30)
+    scenarios.add(scenario)
+wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
+# Select the benchmark you want to run
+# single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU
+# performance of compute kernel (no communication)
+# overlap_benchmark()  # benchmarks different communication overlap options
+# profiling()  # run only two timesteps on a smaller domain for profiling only
+# validation_run()
+if BENCHMARK == 0:
+    single_gpu_benchmark()
+elif BENCHMARK == 1:
+    weak_scaling_overlap(True)
+elif BENCHMARK == 2:
+    strong_scaling_overlap(True)
+else:
+    validation_run()
--- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
-waLBerla_link_files_to_builddir( "*.prm" )
-waLBerla_link_files_to_builddir( "*.py" )
-foreach(config trt smagorinsky mrt entropic_kbc_n4 cumulant )
-    waLBerla_generate_target_from_python(NAME UniformGridGenerated_${config}
-          CODEGEN_CFG ${config}
-          FILE UniformGridGenerated.py
-          OUT_FILES GenMacroGetter.cpp GenMacroGetter.h GenMacroSetter.cpp GenMacroSetter.h
-          GenPackInfo.cpp GenPackInfo.h GenPackInfoAAPush.cpp GenPackInfoAAPush.h GenPackInfoAAPull.cpp GenPackInfoAAPull.h
-          GenLbKernel.cpp GenLbKernel.h GenLbKernelAAEven.cpp GenLbKernelAAEven.h GenLbKernelAAOdd.cpp GenLbKernelAAOdd.h
-          GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h
-          GenDefines.h)
-    waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config}
-          FILES UniformGridGenerated.cpp
-          DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui
-          UniformGridGenerated_${config} python_coupling)
-    set_target_properties(UniformGridBenchmarkGenerated_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
-endforeach()
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
-DomainSetup
-{
-   blocks        <  1,    1,   1 >;
-   cellsPerBlock <  64, 64, 64 >;
-   periodic      <  1,   1,   1 >;
-}
-Parameters 
-{
-	timesteps       200;   // time steps of one performance measurement
-	warmupSteps     1;      // number of steps to run before measurement starts
-    outerIterations 4;      // how many measurements to conduct
-	vtkWriteFrequency 0;           // write a VTK file every n'th step, if zero VTK output is disabled
-	timeStepMode twoField;
-	//twoFieldKernelType manualD3Q19;
-	remainingTimeLoggerFrequency 6;  // interval in seconds to log the estimated remaining time
-    directComm 0;
-	omega 1.8;
-	shearVelocityMagnitude 0.02;
-	useGui 0;
-}
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
-#include "core/Environment.h"
-#include "core/logging/Initialization.h"
-#include "core/OpenMP.h"
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-#include "python_coupling/DictWrapper.h"
-#include "blockforest/Initialization.h"
-#include "field/vtk/VTKWriter.h"
-#include "field/AddToStorage.h"
-#include "blockforest/communication/UniformBufferedScheme.h"
-#include "blockforest/communication/UniformDirectScheme.h"
-#include "timeloop/all.h"
-#include "core/timing/TimingPool.h"
-#include "core/timing/RemainingTimeLogger.h"
-#include "core/waLBerlaBuildInfo.h"
-#include "domain_decomposition/SharedSweep.h"
-#include "gui/Gui.h"
-#include "InitShearVelocity.h"
-#include "ManualKernels.h"
-#include "lbm/lattice_model/D3Q19.h"
-#include "GenDefines.h"
-#include "GenMacroGetter.h"
-#include "GenMacroSetter.h"
-#include "GenLbKernel.h"
-#include "GenLbKernelAAEven.h"
-#include "GenLbKernelAAOdd.h"
-#include "GenPackInfo.h"
-#include "GenPackInfoAAPush.h"
-#include "GenPackInfoAAPull.h"
-#include "GenMpiDtypeInfo.h"
-#include "GenMpiDtypeInfoAAPull.h"
-#include "GenMpiDtypeInfoAAPush.h"
-#include <iomanip>
-using namespace walberla;
-using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
-using VelocityField_T = GhostLayerField< real_t, 3 >;
-int main( int argc, char **argv )
-{
-   mpi::Environment env( argc, argv );
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-   {
-      WALBERLA_MPI_WORLD_BARRIER()
-      auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
-      // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-            uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
-      const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", real_t(0.08));
-      const bool directComm = parameters.getParameter<bool>("directComm", false);
-      auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
-          return new PdfField_T(storage->getNumberOfXCells(*block),
-                                storage->getNumberOfYCells(*block),
-                                storage->getNumberOfZCells(*block),
-                                uint_t(1),
-                                field::fzyx,
-                                make_shared<field::AllocateAligned<real_t, 64>>());
-      };
-      // Creating fields
-      BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-      pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
-      pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
-      if( shearVelocityMagnitude > 0 )
-          initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
-      for( auto & b : *blocks)
-          setterKernel(&b);
-      // Buffered Comm
-      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
-      twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
-      aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
-      aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
-      // Direct Comm
-      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
-      twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
-      aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
-      aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
-      const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
-      std::function<void(IBlock*)> twoFieldKernel;
-      if( twoFieldKernelType == "generated") {
-          twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualGeneric") {
-          using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-          BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-          twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualD3Q19") {
-          using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-          BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-          twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
-                                       "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"")
-      }
-      using F = std::function<void()>;
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-      if( timeStepMode == "twoField")
-      {
-          timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                         << Sweep( twoFieldKernel, "LB stream & collide1" );
-          timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                         << Sweep( twoFieldKernel, "LB stream & collide2" );
-      } else if ( timeStepMode == "twoFieldKernelOnly") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
-          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
-      } else if ( timeStepMode == "aa") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-          timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
-                         << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
-                         << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
-      } else if ( timeStepMode == "aaKernelOnly") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
-      } else {
-          WALBERLA_ABORT("Invalid value for timeStepMode")
-      }
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
-         timeLoop.singleStep();
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
-          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-      // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
-      {
-          auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                           "simulation_step", false, true, true, false, 0 );
-          auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
-          vtkOutput->addCellDataWriter( velWriter );
-          vtkOutput->addBeforeFunction( [&]()
-                                        { for( auto & b : *blocks)
-                                            getterKernel(&b);
-                                        } );
-          timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-      }
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
-      {
-          GUI gui( timeLoop, blocks, argc, argv);
-          gui.run();
-      }
-      else
-      {
-          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-          {
-              timeLoop.setCurrentTimeStepToZero();
-              WcTimer simTimer;
-              auto threads = omp_get_max_threads();
-              simTimer.start();
-              timeLoop.run();
-              simTimer.end();
-              auto time = simTimer.last();
-              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-              using std::setw;
-              WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
-                                                     "  procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
-                                                     "  threads: " << threads <<
-                                                     "  direct_comm: " << directComm <<
-                                                     "  time steps: " << timesteps <<
-                                                     setw(15) << "  block size: " << cellsPerBlock <<
-                                                     "  mlups/core:  " << int(mlupsPerProcess/ threads) <<
-                                                     "  mlups:  " << int(mlupsPerProcess) *  MPIManager::instance()->numProcesses())
-              WALBERLA_ROOT_SECTION()
-              {
-                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                  if ( pythonCallbackResults.isCallable())
-                  {
-                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                      pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
-                      pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
-                      pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
-                      pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
-                      pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
-                      pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
-                      // Call Python function to report results
-                      pythonCallbackResults();
-                  }
-              }
-          }
-      }
-   }
-   return 0;
-}
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
-import sympy as sp
-import pystencils as ps
-from lbmpy.creationfunctions import create_lb_update_rule, create_lb_collision_rule
-from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\
-    generate_mpidtype_info_from_kernel
-from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
-from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
-omega = sp.symbols('omega')
-omega_free = sp.Symbol('omega_free')
-omega_fill = sp.symbols('omega_:10')
-options_dict = {
-    'srt': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'relaxation_rate': omega,
-        'compressible': False,
-    },
-    'trt': {
-        'method': 'trt',
-        'stencil': 'D3Q19',
-        'compressible': False,
-        'relaxation_rate': omega,
-    },
-    'mrt': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [0.0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
-    },
-    'mrt_full': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
-                             omega_fill[3], omega_fill[4], omega_fill[5]],
-    },
-    'entropic': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
-        'entropic': True,
-    },
-    'entropic_kbc_n4': {
-        'method': 'trt-kbc-n4',
-        'stencil': 'D3Q27',
-        'compressible': True,
-        'relaxation_rates': [omega, omega_free],
-        'entropic': True,
-    },
-    'smagorinsky': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'smagorinsky': True,
-        'relaxation_rate': omega,
-    },
-    'cumulant': {
-        'method': 'cumulant',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rate': omega,
-    },
-}
-info_header = """
-#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
-const char * infoStencil = "{stencil}";
-const char * infoConfigName = "{configName}";
-const char * optimizationDict = "{optimizationDict}";
-"""
-with CodeGeneration() as ctx:
-    common_options = {
-        'field_name': 'pdfs',
-        'temporary_field_name': 'pdfs_tmp',
-    }
-    opts = {
-        'two_field_cse_pdfs': False,
-        'two_field_cse_global': False,
-        'two_field_split': True,
-        'two_field_nt_stores': True,
-        'aa_even_cse_pdfs': False,
-        'aa_even_cse_global': False,
-        'aa_even_split': False,
-        'aa_even_nt_stores': False,
-        'aa_odd_cse_pdfs': False,
-        'aa_odd_cse_global': False,
-        'aa_odd_split': True,
-        'aa_odd_nt_stores': False,
-        'compiled_in_boundaries': False,
-    }
-    config_name = ctx.config
-    noopt = False
-    d3q27 = False
-    if config_name.endswith('_d3q27'):
-        d3q27 = True
-        config_name = config_name[:-len('_d3q27')]
-    if config_name == '':
-        config_name = 'trt'
-    options = options_dict[config_name]
-    options.update(common_options)
-    options = options.copy()
-    if d3q27:
-        options['stencil'] = 'D3Q27'
-    dtype_string = 'float64' if ctx.double_accuracy else 'float32'
-    stencil_str = options['stencil']
-    q = int(stencil_str[stencil_str.find('Q') + 1:])
-    pdfs, velocity_field = ps.fields(f'pdfs({q}), velocity(3) : {dtype_string}[3D]', layout='fzyx')
-    update_rule_two_field = create_lb_update_rule(optimization={'symbolic_field': pdfs,
-                                                                'split': opts['two_field_split'],
-                                                                'cse_global': opts['two_field_cse_global'],
-                                                                'cse_pdfs': opts['two_field_cse_pdfs']}, **options)
-    if opts['compiled_in_boundaries']:
-        from lbmpy.boundaries import NoSlip, UBB
-        from lbmpy.boundaries.boundaries_in_kernel import update_rule_with_push_boundaries
-        from collections import OrderedDict
-        boundaries = OrderedDict((
-            ((1, 0, 0), NoSlip()),
-            ((-1, 0, 0), NoSlip()),
-            ((0, 1, 0), NoSlip()),
-            ((0, -1, 0), NoSlip()),
-            ((0, 0, 1), UBB([0.05, 0, 0])),
-            ((0, 0, -1), NoSlip()),
-        ))
-        cr_even = create_lb_collision_rule(stencil='D3Q19', compressible=False,
-                                           optimization={'cse_global': opts['aa_even_cse_global'],
-                                                         'cse_pdfs': opts['aa_even_cse_pdfs']})
-        cr_odd = create_lb_collision_rule(stencil='D3Q19', compressible=False,
-                                          optimization={'cse_global': opts['aa_odd_cse_global'],
-                                                        'cse_pdfs': opts['aa_odd_cse_pdfs']})
-        update_rule_aa_even = update_rule_with_push_boundaries(cr_even, pdfs, boundaries,
-                                                               AAEvenTimeStepAccessor, AAOddTimeStepAccessor.read)
-        update_rule_aa_odd = update_rule_with_push_boundaries(cr_odd, pdfs, boundaries,
-                                                              AAOddTimeStepAccessor, AAEvenTimeStepAccessor.read)
-    else:
-        update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(),
-                                                    optimization={'symbolic_field': pdfs,
-                                                                  'split': opts['aa_even_split'],
-                                                                  'cse_global': opts['aa_even_cse_global'],
-                                                                  'cse_pdfs': opts['aa_even_cse_pdfs']}, **options)
-        update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(),
-                                                   optimization={'symbolic_field': pdfs,
-                                                                 'split': opts['aa_odd_split'],
-                                                                 'cse_global': opts['aa_odd_cse_global'],
-                                                                 'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)
-    vec = {'assume_aligned': True, 'assume_inner_stride_one': True}
-    # check if openmp is enabled in cmake
-    if ctx.openmp:
-        openmp_enabled = True
-    else:
-        openmp_enabled = False
-    # Sweeps
-    vec['nontemporal'] = opts['two_field_nt_stores']
-    generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
-                   cpu_vectorize_info=vec)
-    vec['nontemporal'] = opts['aa_even_nt_stores']
-    generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
-                   cpu_openmp=openmp_enabled, ghost_layers=1)
-    vec['nontemporal'] = opts['aa_odd_nt_stores']
-    generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
-                   cpu_openmp=openmp_enabled, ghost_layers=1)
-    setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=1.0)
-    getter_assignments = macroscopic_values_getter(update_rule_two_field.method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'GenMacroSetter', setter_assignments, cpu_openmp=openmp_enabled)
-    generate_sweep(ctx, 'GenMacroGetter', getter_assignments, cpu_openmp=openmp_enabled)
-    # Communication
-    generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule_two_field,
-                                   cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
-    generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPull', update_rule_aa_odd, kind='pull',
-                                   cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
-    generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push',
-                                   cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False)
-    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field)
-    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull')
-    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push')
-    # Info Header
-    infoHeaderParams = {
-        'stencil': stencil_str,
-        'q': q,
-        'configName': ctx.config,
-        'optimizationDict': str(opts),
-    }
-    ctx.write_file('GenDefines.h', info_header.format(**infoHeaderParams))
--- a/apps/benchmarks/UniformGridGenerated/params.py
+++ b/apps/benchmarks/UniformGridGenerated/params.py
-import math
-import os
-import operator
-import waLBerla as wlb
-from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
-from waLBerla.tools.config import block_decomposition
-from functools import reduce
-import sqlite3
-def prod(seq):
-    return reduce(operator.mul, seq, 1)
-def get_block_decomposition(block_decomposition, num_processes):
-    bx = by = bz = 1
-    blocks_per_axis = int(math.log(num_processes, 2))
-    for i in range(blocks_per_axis):
-        decomposition_axis = block_decomposition[i % len(block_decomposition)]
-        if decomposition_axis == 'y':
-            by *= 2
-        elif decomposition_axis == 'z':
-            bz *= 2
-        elif decomposition_axis == 'x':
-            bx *= 2
-    assert (bx * by * bz) == num_processes
-    return bx, by, bz
-def calculate_time_steps(runtime, expected_mlups, domain_size):
-    cells = prod(domain_size)
-    time_steps_per_second = expected_mlups * 1e6 / cells
-    return int(time_steps_per_second * runtime)
-def domain_decomposition_func_z(processes, threads, block_size):
-    return {
-        'blocks': (1, 1, processes),
-        'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
-    }
-def domain_decomposition_func_full(processes, threads, block_size):
-    return {
-        'blocks': block_decomposition(processes),
-        'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads)
-    }
-class BenchmarkScenario:
-    def __init__(self, block_size=(256, 128, 128), direct_comm=True,
-                 time_step_mode='aa', two_field_kernel_type='generated',
-                 domain_decomposition_func=domain_decomposition_func_z,
-                 db_file_name='uniform_grid_gen.sqlite'):
-        self.block_size = block_size
-        self.direct_comm = direct_comm
-        self.time_step_mode = time_step_mode
-        self.two_field_kernel_type = two_field_kernel_type
-        self.domain_decomposition_func = domain_decomposition_func
-        self.threads = int(os.environ['OMP_NUM_THREADS'])
-        self.processes = wlb.mpi.numProcesses()
-        self.db_file_name = db_file_name
-    @wlb.member_callback
-    def config(self, **kwargs):
-        time_steps_for_128_cubed = 10
-        time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed)
-        time_steps = max(10, time_steps)
-        cfg = {
-            'DomainSetup': {
-                'periodic': (1, 1, 1),
-            },
-            'Parameters': {
-                'timesteps': time_steps,
-                'warmupSteps': 2,
-                'outerIterations': 4,
-                'vtkWriteFrequency': 0,
-                'remainingTimeLoggerFrequency': 0,
-                'omega': 1.6,
-                'timeStepMode': self.time_step_mode,
-                'twoFieldKernelType': self.two_field_kernel_type,
-                'directComm': self.direct_comm,
-            }
-        }
-        cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
-        return cfg
-    @wlb.member_callback
-    def results_callback(self, mlupsPerProcess, optimizations, **kwargs):
-        cfg = self.config()
-        result = {
-            'block_size': self.block_size,
-            'mlups_per_core': mlupsPerProcess / self.threads,
-            'threads': self.threads,
-            'processes': self.processes,
-            'time_step_mode': self.time_step_mode,
-            'direct_comm': self.direct_comm,
-            'time_steps': cfg['Parameters']['timesteps'],
-            'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''),
-            'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_CELL', ''),
-        }
-        optimizations = eval(optimizations)
-        result.update(optimizations)
-        result.update(kwargs)
-        sequenceValuesToScalars(result)
-        num_tries = 4
-        for num_try in range(num_tries):
-            try:
-                checkAndUpdateSchema(result, "runs", self.db_file_name)
-                storeSingle(result, "runs", self.db_file_name)
-                break
-            except sqlite3.OperationalError as e:
-                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/4 " + str(e))
-def block_size_ok(sc):
-    block_size = sc.config()['DomainSetup']['cellsPerBlock']
-    return prod(block_size) * 19 < 2 ** 31
-def single_node_benchmark():
-    scenarios = wlb.ScenarioManager()
-    for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
-                       (1024, 64, 32), (2048, 64, 16),
-                       (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]:
-        for direct_comm in (True, False):
-            for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
-                if time_step_mode == 'twoField':
-                    for kt in ['generated', 'manualGeneric', 'manualD3Q19']:
-                        sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
-                                               time_step_mode=time_step_mode, two_field_kernel_type=kt,
-                                               domain_decomposition_func=domain_decomposition_func_z
-                                               )
-                        if not block_size_ok(sc):
-                            continue
-                        scenarios.add(sc)
-                else:
-                    sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
-                                           domain_decomposition_func=domain_decomposition_func_z,
-                                           time_step_mode=time_step_mode)
-                    if not block_size_ok(sc):
-                        continue
-                        # scenarios.add(sc)
-def single_node_benchmark_small():
-    scenarios = wlb.ScenarioManager()
-    for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
-                       (1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
-                       (256, 128, 64), (512, 128, 32)]:
-        for direct_comm in (True, False):
-            for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']:
-                sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode)
-                if not block_size_ok(sc):
-                    continue
-                scenarios.add(sc)
-def weak_scaling():
-    scenarios = wlb.ScenarioManager()
-    for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64),
-                       (1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16),
-                       (256, 128, 64), (512, 128, 32)]:
-        for direct_comm in (True, False):
-            sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
-                                   domain_decomposition_func=domain_decomposition_func_full)
-            if not block_size_ok(sc):
-                continue
-            scenarios.add(sc)
-single_node_benchmark()
--- a/apps/pythonmodule/CMakeLists.txt
+++ b/apps/pythonmodule/CMakeLists.txt
 # waLBerla Python module
+if ( NOT TARGET walberla::python_coupling )
+    message( WARNING "python module ist not build since the walberla::python_coupling target is non-existent" )
+else()
 if ( WALBERLA_BUILD_WITH_PYTHON )
-    if ( WALBERLA_BUILD_WITH_CUDA )
+    if ( WALBERLA_BUILD_WITH_GPU_SUPPORT )
-        set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk cuda)
+        set( PYTHON_MODULE_DEPENDENCIES walberla::blockforest walberla::boundary walberla::domain_decomposition walberla::core walberla::field walberla::python_coupling walberla::timeloop walberla::vtk walberla::gpu )
    else()
-        set(PYTHON_MODULE_DEPENDENCIES blockforest boundary domain_decomposition core field python_coupling timeloop vtk)
+        set( PYTHON_MODULE_DEPENDENCIES walberla::blockforest walberla::boundary walberla::domain_decomposition walberla::core walberla::field walberla::python_coupling walberla::timeloop walberla::vtk )
    endif()
    if( WALBERLA_CXX_COMPILER_IS_MSVC )
@@ -25,10 +26,10 @@ if ( WALBERLA_BUILD_WITH_PYTHON )
            SUFFIX "${PYTHON_MODULE_EXTENSION}"
    )
    set_target_properties( walberla_cpp PROPERTIES MACOSX_RPATH TRUE )
-    set_target_properties( walberla_cpp PROPERTIES CXX_VISIBILITY_PRESET hidden)
    configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/setup.py  ${CMAKE_CURRENT_BINARY_DIR}/setup.py )
-    add_custom_target( pythonModule        ALL ${PYTHON_EXECUTABLE} setup.py build   DEPENDS walberla_cpp )
+    add_custom_target( pythonModule        ALL ${Python_EXECUTABLE} setup.py build   DEPENDS walberla_cpp )
-    add_custom_target( pythonModuleInstall     ${PYTHON_EXECUTABLE} setup.py install --user DEPENDS walberla_cpp )
+    add_custom_target( pythonModuleInstall     ${Python_EXECUTABLE} setup.py install --user DEPENDS walberla_cpp )
 endif()
+endif()
\ No newline at end of file
--- a/apps/pythonmodule/PythonModule.cpp
+++ b/apps/pythonmodule/PythonModule.cpp
@@ -28,8 +28,8 @@
 #include "stencil/all.h"
-#ifdef WALBERLA_BUILD_WITH_CUDA
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
- #include "python_coupling/export/CUDAExport.h"
+ #include "python_coupling/export/GPUExport.h"
 #endif
@@ -75,11 +75,11 @@ struct InitObject
      pythonManager->addExporterFunction(blockforest::exportModuleToPython<stencil::D2Q5, stencil::D2Q9, stencil::D3Q7, stencil::D3Q19, stencil::D3Q27>);
      // VTK
      pythonManager->addExporterFunction( vtk::exportModuleToPython );
-      #ifdef WALBERLA_BUILD_WITH_CUDA
+      #ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
-            using walberla::cuda::GPUField;
+            using walberla::gpu::GPUField;
-            pythonManager->addExporterFunction( cuda::exportModuleToPython<GPU_FIELD_TYPES> );
+            pythonManager->addExporterFunction(gpu::exportModuleToPython<GPU_FIELD_TYPES> );
-            pythonManager->addExporterFunction( cuda::exportCopyFunctionsToPython<FIELD_TYPES> );
+            pythonManager->addExporterFunction(gpu::exportCopyFunctionsToPython<FIELD_TYPES> );
            pythonManager->addBlockDataConversion<GPU_FIELD_TYPES>();
      #endif
      //

--- a/apps/showcases/Antidunes/Antidunes.cpp
+++ b/apps/showcases/Antidunes/Antidunes.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AntiDunes.cpp
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//! \author Jonas Plewinski <jonas.plewinski@fau.de>
+//! \author Christoph Rettinger <christoph.rettinger@fau.de>
+//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de>
+//
+// This showcase simulates antidunes, i.e., particulate dunes that travel in opposite stream-wise direction. See
+// simulation results published in https://doi.org/10.1017/jfm.2023.262.
+//======================================================================================================================
+#include "blockforest/Initialization.h"
+#include "core/Environment.h"
+#include "core/SharedFunctor.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "domain_decomposition/SharedSweep.h"
+#include "field/StabilityChecker.h"
+#include "lbm/PerformanceLogger.h"
+#include "lbm/field/AddToStorage.h"
+#include "lbm/free_surface/BlockStateDetectorSweep.h"
+#include "lbm/free_surface/SurfaceMeshWriter.h"
+#include "lbm/free_surface/TotalMassComputer.h"
+#include "lbm/free_surface/VtkWriter.h"
+#include "lbm/free_surface/dynamics/CellConversionSweep.h"
+#include "lbm/free_surface/dynamics/ConversionFlagsResetSweep.h"
+#include "lbm/free_surface/dynamics/ExcessMassDistributionSweep.h"
+#include "lbm/free_surface/dynamics/PdfRefillingSweep.h"
+#include "lbm/free_surface/dynamics/StreamReconstructAdvectSweep.h"
+#include "lbm/free_surface/surface_geometry/CurvatureSweep.h"
+#include "lbm/free_surface/surface_geometry/NormalSweep.h"
+#include "lbm/free_surface/surface_geometry/SmoothingSweep.h"
+#include "lbm/free_surface/surface_geometry/Utility.h"
+#include "lbm_mesapd_coupling/mapping/ParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/MovingParticleMapping.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/PdfReconstructionManager.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/reconstruction/Reconstructor.h"
+#include "lbm_mesapd_coupling/utility/AddHydrodynamicInteractionKernel.h"
+#include "lbm_mesapd_coupling/utility/AverageHydrodynamicForceTorqueKernel.h"
+#include "lbm_mesapd_coupling/utility/InitializeHydrodynamicForceTorqueForAveragingKernel.h"
+#include "lbm_mesapd_coupling/utility/LubricationCorrectionKernel.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+#include "lbm_mesapd_coupling/utility/ResetHydrodynamicForceTorqueKernel.h"
+#include "mesa_pd/collision_detection/AnalyticContactDetection.h"
+#include "mesa_pd/data/ParticleAccessorWithBaseShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDataHandling.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/kernel/DoubleCast.h"
+#include "mesa_pd/kernel/LinearSpringDashpot.h"
+#include "mesa_pd/kernel/ParticleSelector.h"
+#include "mesa_pd/kernel/VelocityVerlet.h"
+#include "mesa_pd/mpi/ClearGhostOwnerSync.h"
+#include "mesa_pd/mpi/ClearNextNeighborSync.h"
+#include "mesa_pd/mpi/ContactFilter.h"
+#include "mesa_pd/mpi/ReduceContactHistory.h"
+#include "mesa_pd/mpi/ReduceProperty.h"
+#include "mesa_pd/mpi/SyncGhostOwners.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/mpi/notifications/ForceTorqueNotification.h"
+#include "mesa_pd/mpi/notifications/HydrodynamicForceTorqueNotification.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+#include "vtk/VTKOutput.h"
+#include <core/waLBerlaBuildInfo.h>
+#include "AntidunesBoundaryHandling.h"
+#include "AntidunesLatticeModel.h"
+#include "PIDController.h"
+#include "Utility.h"
+namespace walberla
+{
+namespace antidunes
+{
+using ScalarField_T          = GhostLayerField< real_t, 1 >;
+using VectorField_T          = GhostLayerField< Vector3< real_t >, 1 >;
+using VectorFieldFlattened_T = GhostLayerField< real_t, 3 >;
+using LatticeModel_T        = lbm::AntidunesLatticeModel;
+using LatticeModelStencil_T = LatticeModel_T::Stencil;
+using PdfField_T            = lbm::PdfField< LatticeModel_T >;
+using PdfCommunication_T    = blockforest::SimpleCommunication< LatticeModelStencil_T >;
+// the geometry computations in SurfaceGeometryHandler require meaningful values in the ghost layers in corner
+// directions (flag field and fill level field); this holds, even if the lattice model uses a D3Q19 stencil
+using CommunicationStencil_T =
+   std::conditional_t< LatticeModel_T::Stencil::D == uint_t(2), stencil::D2Q9, stencil::D3Q27 >;
+using Communication_T = blockforest::SimpleCommunication< CommunicationStencil_T >;
+using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithBaseShape;
+using flag_t      = uint32_t;
+using FlagField_T = FlagField< flag_t >;
+using AntidunesBoundaryHandling_T =
+   free_surface::AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >;
+using StateSweep = walberla::free_surface::BlockStateDetectorSweep< FlagField_T >;
+const FlagUID FormerMO_Flag("former moving obstacle");
+// empty sweep required for using selectors (e.g. StateSweep::fullFreeSurface)
+struct emptySweep
+{
+   void operator()(IBlock*) {}
+};
+// data handling for loading a field of type ScalarField_T from file
+template< typename ScalarField_T >
+class ScalarFieldHandling : public field::BlockDataHandling< ScalarField_T >
+{
+ public:
+   ScalarFieldHandling(const weak_ptr< StructuredBlockStorage >& blocks, uint_t numberGhostLayer)
+      : blocks_(blocks), numberGhostLayer_(numberGhostLayer)
+   {}
+ protected:
+   ScalarField_T* allocate(IBlock* const block) override { return allocateDispatch(block); }
+   ScalarField_T* reallocate(IBlock* const block) override { return allocateDispatch(block); }
+ private:
+   weak_ptr< StructuredBlockStorage > blocks_;
+   uint_t numberGhostLayer_;
+   ScalarField_T* allocateDispatch(IBlock* const block)
+   {
+      WALBERLA_ASSERT_NOT_NULLPTR(block);
+      auto blocks = blocks_.lock();
+      WALBERLA_CHECK_NOT_NULLPTR(blocks);
+      return new ScalarField_T(blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block),
+                               blocks->getNumberOfZCells(*block), numberGhostLayer_, real_c(0), field::fzyx);
+   }
+}; // class ScalarFieldHandling
+// data handling for loading a field of type VectorFieldFlattened_T from file
+template< typename VectorFieldFlattened_T >
+class VectorFieldFlattenedHandling : public field::BlockDataHandling< VectorFieldFlattened_T >
+{
+ public:
+   VectorFieldFlattenedHandling(const weak_ptr< StructuredBlockStorage >& blocks, uint_t numberGhostLayer)
+      : blocks_(blocks), numberGhostLayer_(numberGhostLayer)
+   {}
+ protected:
+   VectorFieldFlattened_T* allocate(IBlock* const block) override { return allocateDispatch(block); }
+   VectorFieldFlattened_T* reallocate(IBlock* const block) override { return allocateDispatch(block); }
+ private:
+   weak_ptr< StructuredBlockStorage > blocks_;
+   uint_t numberGhostLayer_;
+   VectorFieldFlattened_T* allocateDispatch(IBlock* const block)
+   {
+      WALBERLA_ASSERT_NOT_NULLPTR(block);
+      auto blocks = blocks_.lock();
+      WALBERLA_CHECK_NOT_NULLPTR(blocks);
+      return new VectorFieldFlattened_T(blocks->getNumberOfXCells(*block), blocks->getNumberOfYCells(*block),
+                                        blocks->getNumberOfZCells(*block), numberGhostLayer_, field::fzyx);
+   }
+}; // class VectorFieldFlattenedHandling
+// sweep for computing the force density from the fluid density and fill level
+template< typename LatticeModel_T, typename FlagField_T, typename VectorFieldFlattened_T, typename ScalarField_T >
+class ForceDensityCodegenSweep
+{
+ public:
+   ForceDensityCodegenSweep(BlockDataID forceDensityFieldID, ConstBlockDataID pdfFieldID, ConstBlockDataID flagFieldID,
+                            ConstBlockDataID fillFieldID,
+                            const walberla::free_surface::FlagInfo< FlagField_T >& flagInfo,
+                            std::shared_ptr< Vector3< real_t > > globalAcceleration)
+      : forceDensityFieldID_(forceDensityFieldID), pdfFieldID_(pdfFieldID), flagFieldID_(flagFieldID),
+        fillFieldID_(fillFieldID), flagInfo_(flagInfo), globalAcceleration_(globalAcceleration)
+   {}
+   void operator()(IBlock* const block)
+   {
+      VectorFieldFlattened_T* const forceDensityField = block->getData< VectorFieldFlattened_T >(forceDensityFieldID_);
+      const PdfField_T* const pdfField                = block->getData< const PdfField_T >(pdfFieldID_);
+      const FlagField_T* const flagField              = block->getData< const FlagField_T >(flagFieldID_);
+      const ScalarField_T* const fillField            = block->getData< const ScalarField_T >(fillFieldID_);
+      WALBERLA_FOR_ALL_CELLS(forceDensityFieldIt, forceDensityField, pdfFieldIt, pdfField, flagFieldIt, flagField,
+                             fillFieldIt, fillField, {
+                                flag_t flag = *flagFieldIt;
+                                // set force density in cells to acceleration * density * fillLevel (see equation 15
+                                // in Koerner et al., 2005);
+                                if (flagInfo_.isInterface(flag))
+                                {
+                                   const real_t density   = pdfField->getDensity(pdfFieldIt.cell());
+                                   forceDensityFieldIt[0] = (*globalAcceleration_)[0] * *fillFieldIt * density;
+                                   forceDensityFieldIt[1] = (*globalAcceleration_)[1] * *fillFieldIt * density;
+                                   forceDensityFieldIt[2] = (*globalAcceleration_)[2] * *fillFieldIt * density;
+                                }
+                                else
+                                {
+                                   if (flagInfo_.isLiquid(flag))
+                                   {
+                                      const real_t density   = pdfField->getDensity(pdfFieldIt.cell());
+                                      forceDensityFieldIt[0] = (*globalAcceleration_)[0] * density;
+                                      forceDensityFieldIt[1] = (*globalAcceleration_)[1] * density;
+                                      forceDensityFieldIt[2] = (*globalAcceleration_)[2] * density;
+                                   }
+                                }
+                             }) // WALBERLA_FOR_ALL_CELLS
+   }
+ private:
+   using flag_t = typename FlagField_T::flag_t;
+   BlockDataID forceDensityFieldID_;
+   ConstBlockDataID pdfFieldID_;
+   ConstBlockDataID flagFieldID_;
+   ConstBlockDataID fillFieldID_;
+   walberla::free_surface::FlagInfo< FlagField_T > flagInfo_;
+   std::shared_ptr< Vector3< real_t > > globalAcceleration_;
+}; // class ForceDensitySweep
+// function describing the global initialization profile
+inline real_t initializationProfile(real_t x, real_t amplitude, real_t offset, real_t wavelength)
+{
+   return amplitude * std::cos(x / wavelength * real_c(2) * math::pi + math::pi) + offset;
+}
+real_t getHydrostaticDensity(real_t height, real_t referenceHeight, real_t gravitationalAcceleration)
+{
+   return real_c(1) + real_c(3) * gravitationalAcceleration * (height - referenceHeight);
+}
+void initializePoiseuilleProfile(StructuredBlockForest& forest, const BlockDataID& pdfFieldID,
+                                 const ConstBlockDataID& fillFieldID, const real_t& averageBedHeight,
+                                 const real_t& averageFluidHeight, const real_t& accelerationX, const real_t& viscosity,
+                                 real_t amplitude, real_t wavelength)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Initializing Poiseuille velocity profile");
+   const real_t rho = real_c(1);
+   for (auto blockIt = forest.begin(); blockIt != forest.end(); ++blockIt)
+   {
+      PdfField_T* const pdfField           = blockIt->getData< PdfField_T >(pdfFieldID);
+      const ScalarField_T* const fillField = blockIt->getData< const ScalarField_T >(fillFieldID);
+      WALBERLA_FOR_ALL_CELLS_XYZ(
+         pdfField, const Vector3< real_t > coord = forest.getBlockLocalCellCenter(*blockIt, Cell(x, y, z));
+         Vector3< real_t > velocity(real_c(0));
+         auto localBedHeight = initializationProfile(coord[0], amplitude, averageBedHeight, wavelength);
+         auto heightAboveBed = coord[2] - localBedHeight;
+         const real_t fillLevel = fillField->get(x, y, z);
+         if (heightAboveBed >= real_c(0) && fillLevel > real_c(0)) {
+            velocity[0] = accelerationX / (real_c(2) * viscosity) * heightAboveBed *
+                          (real_c(2) * averageFluidHeight - heightAboveBed);
+         } pdfField->setToEquilibrium(x, y, z, velocity, rho);)
+   }
+}
+/***********************************************************************************************************************
+ * Initialize the hydrostatic pressure in the direction in which a force is acting in ALL cells (regardless of a cell's
+ * flag). The velocity remains unchanged.
+ *
+ * The force vector must have only one component, i.e., the direction of the force can only be in x-, y- or z-axis.
+ * The variable fluidHeight determines the height at which the density is equal to reference density (=1).
+ **********************************************************************************************************************/
+template< typename PdfField_T >
+void initHydrostaticPressure(const std::weak_ptr< StructuredBlockForest >& blockForestPtr,
+                             const BlockDataID& pdfFieldID,
+                             std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Initializing hydrostatic pressure");
+   const auto blockForest = blockForestPtr.lock();
+   WALBERLA_CHECK_NOT_NULLPTR(blockForest);
+   for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt)
+   {
+      PdfField_T* const pdfField = blockIt->getData< PdfField_T >(pdfFieldID);
+      CellInterval local = pdfField->xyzSizeWithGhostLayer(); // block-, i.e., process-local cell interval
+      for (const auto& cell : local)
+      {
+         // initialize the (hydrostatic) pressure, i.e., LBM density
+         // Bernoulli: p = p0 + density * gravity * height
+         // => LBM (density=1): rho = rho0 + gravity * height = 1 + 1/cs^2 * g * h = 1 + 3 * g * h
+         // shift global cell by 0.5 since density is set for cell center
+         Vector3< real_t > cellCenter = blockForest->getBlockLocalCellCenter(*blockIt, cell);
+         const real_t rho             = hydrostaticDensityFct(cellCenter);
+         const Vector3< real_t > velocity = pdfField->getVelocity(cell);
+         pdfField->setDensityAndVelocity(cell, velocity, rho);
+      }
+   }
+}
+template< typename FreeSurfaceBoundaryHandling_T, typename PdfField_T, typename FlagField_T >
+class MeanVelocityComputer
+{
+ public:
+   MeanVelocityComputer(const std::weak_ptr< const StructuredBlockForest >& blockForest,
+                        const std::weak_ptr< const FreeSurfaceBoundaryHandling_T >& freeSurfaceBoundaryHandling,
+                        const ConstBlockDataID& pdfFieldID, const std::shared_ptr< Vector3< real_t > >& meanVelocity,
+                        real_t averagingFactor)
+      : blockForest_(blockForest), freeSurfaceBoundaryHandling_(freeSurfaceBoundaryHandling), pdfFieldID_(pdfFieldID),
+        meanVelocity_(meanVelocity), averagingFactor_(averagingFactor)
+   {}
+   void operator()()
+   {
+      auto blockForest = blockForest_.lock();
+      WALBERLA_CHECK_NOT_NULLPTR(blockForest);
+      auto freeSurfaceBoundaryHandling = freeSurfaceBoundaryHandling_.lock();
+      WALBERLA_CHECK_NOT_NULLPTR(freeSurfaceBoundaryHandling);
+      getMeanVelocity(blockForest, freeSurfaceBoundaryHandling);
+   }
+   void getMeanVelocity(const std::shared_ptr< const StructuredBlockForest >& blockForest,
+                        const std::shared_ptr< const FreeSurfaceBoundaryHandling_T >& freeSurfaceBoundaryHandling)
+   {
+      const BlockDataID flagFieldID = freeSurfaceBoundaryHandling->getFlagFieldID();
+      const typename FreeSurfaceBoundaryHandling_T::FlagInfo_T& flagInfo = freeSurfaceBoundaryHandling->getFlagInfo();
+      // use separate variables for the velocity in each direction; syntax meanVelocity[0] does not work in OMP-macro
+      real_t meanVelocityX = real_c(0);
+      real_t meanVelocityY = real_c(0);
+      real_t meanVelocityZ = real_c(0);
+      for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt)
+      {
+         const FlagField_T* const flagField = blockIt->template getData< const FlagField_T >(flagFieldID);
+         const PdfField_T* const pdfField   = blockIt->template getData< const PdfField_T >(pdfFieldID_);
+         WALBERLA_FOR_ALL_CELLS_OMP(flagFieldIt, flagField, pdfFieldIt, pdfField,
+                                    omp parallel for schedule(static) reduction(+:meanVelocityX)
+                                       reduction(+:meanVelocityY) reduction(+:meanVelocityZ),
+                                    {
+            if (flagInfo.isLiquid(flagFieldIt) || flagInfo.isInterface(flagFieldIt))
+            {
+               const Vector3< real_t > velocity = pdfField->getVelocity(pdfFieldIt.cell());
+               meanVelocityX += velocity[0];
+               meanVelocityY += velocity[1];
+               meanVelocityZ += velocity[2];
+               //++cellCount;
+            }
+                                    }) // WALBERLA_FOR_ALL_CELLS_OMP
+      }
+      Vector3< real_t > meanVelocity(meanVelocityX, meanVelocityY, meanVelocityZ);
+      mpi::allReduceInplace< real_t >(meanVelocity, mpi::SUM);
+      // mpi::allReduceInplace< uint_t >(cellCount, mpi::SUM);
+      meanVelocity *= averagingFactor_;
+      *meanVelocity_ = meanVelocity;
+   };
+ private:
+   std::weak_ptr< const StructuredBlockForest > blockForest_;
+   std::weak_ptr< const FreeSurfaceBoundaryHandling_T > freeSurfaceBoundaryHandling_;
+   const ConstBlockDataID pdfFieldID_;
+   std::shared_ptr< Vector3< real_t > > meanVelocity_;
+   real_t averagingFactor_;
+}; // class MeanVelocityComputer
+class ForcingAdjuster
+{
+ public:
+   ForcingAdjuster(const shared_ptr< StructuredBlockStorage >& blocks, real_t targetVelocity, real_t externalForcing,
+                   real_t proportionalGain, real_t derivativeGain, real_t integralGain, real_t maxRamp,
+                   real_t minActuatingVariable, real_t maxActuatingVariable)
+      : blocks_(blocks), currentExternalForcing_(externalForcing),
+        pid_(targetVelocity, externalForcing, proportionalGain, derivativeGain, integralGain, maxRamp,
+             minActuatingVariable, maxActuatingVariable)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Creating PID controller with pg = " << pid_.getProportionalGain()
+                                                                     << ", dg = " << pid_.getDerivateGain()
+                                                                     << ", ig = " << pid_.getIntegralGain());
+   }
+   void operator()(const real_t currentMeanVelocity)
+   {
+      // compute new forcing value on root (since flow rate only known on root)
+      WALBERLA_ROOT_SECTION()
+      {
+         real_t newExternalForcing = pid_.update(currentMeanVelocity);
+         currentExternalForcing_   = newExternalForcing;
+      }
+      // send updated external forcing to all other processes
+      mpi::broadcastObject(currentExternalForcing_);
+   }
+   real_t getExternalForcing() { return currentExternalForcing_; }
+   void storePIDSnapshot(std::string filename)
+   {
+      WALBERLA_ROOT_SECTION() { pid_.writeStateToFile(filename); }
+   }
+   void loadPIDSnapshot(std::string filename) { pid_.readStateFromFile(filename); }
+ private:
+   shared_ptr< StructuredBlockStorage > blocks_;
+   real_t currentExternalForcing_;
+   PIDController pid_;
+}; // ForcingAdjuster
+int main(int argc, char** argv)
+{
+   Environment walberlaEnv(argc, argv);
+   WALBERLA_LOG_INFO_ON_ROOT("waLBerla Revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8))
+   if (argc < 2) { WALBERLA_ABORT("Please specify a parameter file as input argument.") }
+   WALBERLA_LOG_DEVEL_ON_ROOT("Using generated lattice model.");
+   auto configPtr = walberlaEnv.config();
+   // print content of parameter file
+   WALBERLA_LOG_INFO_ON_ROOT(*configPtr);
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream file;
+      file.open("parameterConfiguration.cfg");
+      file << *configPtr;
+      file.close();
+   }
+   // get block forest parameters from parameter file
+   auto blockForestParameters            = configPtr->getOneBlock("BlockForestParameters");
+   const Vector3< uint_t > cellsPerBlock = blockForestParameters.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   const Vector3< bool > periodicity     = blockForestParameters.getParameter< Vector3< bool > >("periodicity");
+   const bool loadSnapshot               = blockForestParameters.getParameter< bool >("loadSnapshot");
+   const bool storeSnapshot              = blockForestParameters.getParameter< bool >("storeSnapshot");
+   const uint_t snapshotFrequency        = blockForestParameters.getParameter< uint_t >("snapshotFrequency");
+   const std::string snapshotBaseFolder  = blockForestParameters.getParameter< std::string >("snapshotBaseFolder");
+   // get domain parameters from parameter file
+   auto domainParameters              = configPtr->getOneBlock("DomainParameters");
+   const Vector3< uint_t > domainSize = domainParameters.getParameter< Vector3< uint_t > >("domainSize");
+   const uint_t wavePeriods           = domainParameters.getParameter< uint_t >("wavePeriods");
+   const real_t liquidHeightFactor    = domainParameters.getParameter< real_t >("liquidHeightFactor");
+   const real_t floorHeightFactor     = domainParameters.getParameter< real_t >("floorHeightFactor");
+   const real_t initialAmplitude      = domainParameters.getParameter< real_t >("initialAmplitude");
+   // compute number of blocks as defined by domainSize and cellsPerBlock
+   Vector3< uint_t > numBlocks;
+   for (uint_t i = uint_c(0); i <= uint_c(2); ++i)
+   {
+      numBlocks[i] = domainSize[i] / cellsPerBlock[i];
+      WALBERLA_CHECK_EQUAL(numBlocks[i] * cellsPerBlock[i], domainSize[i],
+                           "Domain size in direction " << i << " is not a multiple of cells per block.")
+   }
+   // get number of (MPI) processes
+   const uint_t numProcesses = uint_c(MPIManager::instance()->numProcesses());
+   WALBERLA_CHECK_EQUAL(numProcesses, numBlocks[0] * numBlocks[1] * numBlocks[2],
+                        "The number of MPI processes is different from the number of blocks as defined by "
+                        "\"domainSize/cellsPerBlock\".")
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numProcesses);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(numBlocks);
+   // get PID controller parameters
+   auto PIDParameters                       = configPtr->getOneBlock("PIDParameters");
+   const real_t targetMeanVelocityMagnitude = PIDParameters.getParameter< real_t >("targetMeanVelocityMagnitude");
+   const real_t proportionalGain            = PIDParameters.getParameter< real_t >("proportionalGain");
+   const real_t derivativeGain              = PIDParameters.getParameter< real_t >("derivativeGain");
+   const real_t integralGain                = PIDParameters.getParameter< real_t >("integralGain");
+   const real_t maxRamp                     = PIDParameters.getParameter< real_t >("maxRamp");
+   const real_t minActuatingVariable        = PIDParameters.getParameter< real_t >("minActuatingVariable");
+   const real_t maxActuatingVariable        = PIDParameters.getParameter< real_t >("maxActuatingVariable");
+   // read particle infos
+   const auto particleParameters               = configPtr->getOneBlock("ParticleParameters");
+   const std::string particleInFileName        = particleParameters.getParameter< std::string >("inFileName");
+   const uint_t bedCopiesInX                   = particleParameters.getParameter< uint_t >("bedCopiesInX");
+   const uint_t bedCopiesInY                   = particleParameters.getParameter< uint_t >("bedCopiesInY");
+   const real_t particleDensityRatio           = particleParameters.getParameter< real_t >("densityRatio");
+   const real_t particleFixingHeightFactor     = particleParameters.getParameter< real_t >("fixingHeightFactor");
+   const real_t particleFrictionCoefficient    = particleParameters.getParameter< real_t >("frictionCoefficient");
+   const real_t particleRestitutionCoefficient = particleParameters.getParameter< real_t >("restitutionCoefficient");
+   const uint_t particleNumSubCycles           = particleParameters.getParameter< uint_t >("numSubCycles");
+   const bool useLubricationCorrection         = particleParameters.getParameter< bool >("useLubricationCorrection");
+   const bool useNoSlipParticles               = particleParameters.getParameter< bool >("useNoSlipParticles");
+   const real_t particlePoissonsRatio          = real_c(0.22);
+   const real_t particleKappa = real_c(2) * (real_c(1) - particlePoissonsRatio) / (real_c(2) - particlePoissonsRatio);
+   real_t particleCollisionTimeNonDim = real_c(4);
+   bool useOpenMP                     = false;
+   const uint_t vtkSpacingParticles =
+      configPtr->getOneBlock("VTK").getOneBlock("fluid_field").getParameter< uint_t >("writeFrequency");
+   const std::string vtkFolder =
+      configPtr->getOneBlock("VTK").getOneBlock("fluid_field").getParameter< std::string >("baseFolder");
+   // get physics parameters from parameter file
+   auto physicsParameters   = configPtr->getOneBlock("PhysicsParameters");
+   const bool enableWetting = physicsParameters.getParameter< bool >("enableWetting");
+   const uint_t timesteps   = physicsParameters.getParameter< uint_t >("timesteps");
+   const real_t Re          = physicsParameters.getParameter< real_t >("Re");
+   const real_t Fr          = physicsParameters.getParameter< real_t >("Fr");
+   const real_t We          = physicsParameters.getParameter< real_t >("We");
+   // get avgDiameter and scaling factor
+   real_t avgParticleDiameter   = real_c(0);
+   real_t particleScalingFactor = real_c(0);
+   getAvgDiameterScalingFactor(particleInFileName, domainSize, bedCopiesInX, bedCopiesInY, avgParticleDiameter,
+                               particleScalingFactor);
+   const real_t liquidHeight         = avgParticleDiameter * liquidHeightFactor;
+   const real_t floorHeight          = avgParticleDiameter * floorHeightFactor;
+   const real_t particleFixingHeight = particleFixingHeightFactor * avgParticleDiameter;
+   WALBERLA_CHECK_FLOAT_UNEQUAL(liquidHeight, 0.0)
+   WALBERLA_CHECK_FLOAT_UNEQUAL(floorHeight, 0.0)
+   const real_t absoluteLiquidHeight = liquidHeight + floorHeight;
+   const real_t viscosity      = targetMeanVelocityMagnitude * liquidHeight / Re;
+   const real_t relaxationRate = real_c(1.0) / (real_c(3) * viscosity + real_c(0.5));
+   const real_t gravity       = (targetMeanVelocityMagnitude / Fr) * (targetMeanVelocityMagnitude / Fr) / liquidHeight;
+   const real_t accelerationX = real_c(3) * targetMeanVelocityMagnitude * viscosity / (liquidHeight * liquidHeight);
+   std::shared_ptr< Vector3< real_t > > acceleration =
+      std::make_shared< Vector3< real_t > >(accelerationX, real_c(0.0), -gravity);
+   const real_t surfaceTension =
+      real_c(1.0) * targetMeanVelocityMagnitude * targetMeanVelocityMagnitude * liquidHeight / We;
+   // compute SI dx and dt
+   const real_t viscosity_SI = real_c(1.0016e-6); // kinemtic viscosity of water at 20°C at 1 bar
+   const real_t dx_SI        = real_c(1) / particleScalingFactor;
+   const real_t dt_SI        = viscosity / viscosity_SI * dx_SI * dx_SI;
+   WALBERLA_LOG_INFO_ON_ROOT("\nPhysical parameters:")
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(liquidHeight);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(floorHeight);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(dx_SI);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(dt_SI);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(*acceleration);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(relaxationRate);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(viscosity);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(absoluteLiquidHeight);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(avgParticleDiameter);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(particleScalingFactor);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(particleFixingHeight);
+   WALBERLA_LOG_INFO_ON_ROOT("\nFree surface physical parameters")
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(surfaceTension);
+   if ((periodicity[0] && numBlocks[0] < uint_c(3)) || (periodicity[1] && numBlocks[1] < uint_c(3)) ||
+       (periodicity[2] && numBlocks[2] < uint_c(3)))
+   {
+      WALBERLA_ABORT("When using particles, use at least three blocks per periodic direction.");
+   }
+   // read model parameters from parameter file
+   const auto modelParameters               = configPtr->getOneBlock("ModelParameters");
+   const std::string pdfReconstructionModel = modelParameters.getParameter< std::string >("pdfReconstructionModel");
+   const std::string excessMassDistributionModel =
+      modelParameters.getParameter< std::string >("excessMassDistributionModel");
+   const walberla::free_surface::ExcessMassDistributionModel excessMassModel(excessMassDistributionModel);
+   const bool useSimpleMassExchange          = modelParameters.getParameter< bool >("useSimpleMassExchange");
+   const real_t cellConversionThreshold      = modelParameters.getParameter< real_t >("cellConversionThreshold");
+   const real_t cellConversionForceThreshold = modelParameters.getParameter< real_t >("cellConversionForceThreshold");
+   // read evaluation parameters from parameter file
+   const auto evaluationParameters      = configPtr->getOneBlock("EvaluationParameters");
+   const uint_t performanceLogFrequency = evaluationParameters.getParameter< uint_t >("performanceLogFrequency");
+   const uint_t evaluationFrequency     = evaluationParameters.getParameter< uint_t >("evaluationFrequency");
+   const std::string baseFolderName     = evaluationParameters.getParameter< std::string >("baseFolderName");
+   uint_t beginTimeStep = 0;
+   const std::string checkpointConfigFile("antidunesCheckpointConfig.file");
+   if (loadSnapshot)
+   {
+      WALBERLA_ROOT_SECTION()
+      {
+         std::ifstream file;
+         file.open(snapshotBaseFolder + "/" + checkpointConfigFile);
+         if (file.fail()) WALBERLA_ABORT("Error: " << checkpointConfigFile << " could not be opened!");
+         file >> beginTimeStep;
+         file >> (*acceleration)[0];
+         file.close();
+      }
+      mpi::broadcastObject(beginTimeStep);
+      mpi::broadcastObject(*acceleration);
+      WALBERLA_LOG_INFO_ON_ROOT("Successfully read config parameters from checkpoint config file:")
+      WALBERLA_LOG_INFO_ON_ROOT(" - beginTimeStep = " << beginTimeStep)
+      WALBERLA_LOG_INFO_ON_ROOT(" - acceleration = < " << (*acceleration)[0] << ", " << (*acceleration)[1] << ", "
+                                                       << (*acceleration)[2] << " >")
+   }
+   if (loadSnapshot)
+   {
+      // modify config file to start VTK output from "loadFromTimestep" rather than from 0
+      std::vector< config::Config::Block* > configVTKBlock;
+      configPtr->getWritableGlobalBlock().getWritableBlocks("VTK", configVTKBlock, 1, 1);
+      std::vector< config::Config::Block* > configVTKFluidFieldBlock;
+      configVTKBlock[0]->getWritableBlocks("fluid_field", configVTKFluidFieldBlock, 1, 1);
+      configVTKFluidFieldBlock[0]->setOrAddParameter("initialExecutionCount", std::to_string(beginTimeStep));
+   }
+   WALBERLA_ROOT_SECTION()
+   {
+      // create base directories if they do not yet exist
+      filesystem::path tpath(baseFolderName);
+      if (!filesystem::exists(tpath)) filesystem::create_directory(tpath);
+      filesystem::path snapshotPath(snapshotBaseFolder);
+      if (!filesystem::exists(snapshotPath)) filesystem::create_directory(snapshotPath);
+   }
+   std::shared_ptr< StructuredBlockForest > blockForest(nullptr);
+   const std::string blockForestFile("blockForest.file");
+   if (loadSnapshot)
+   {
+      // load block forest from file
+      MPIManager::instance()->useWorldComm();
+      blockForest = make_shared< StructuredBlockForest >(
+         make_shared< BlockForest >(uint_c(MPIManager::instance()->rank()),
+                                    (std::string(snapshotBaseFolder + "/" + blockForestFile)).c_str(), true, false),
+         cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
+      blockForest->createCellBoundingBoxes();
+   }
+   else
+   {
+      // create uniform block forest
+      blockForest = blockforest::createUniformBlockGrid(numBlocks[0], numBlocks[1], numBlocks[2],             // blocks
+                                                        cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2], // cells
+                                                        real_c(1.0),                                          // dx
+                                                        true, // one block per process
+                                                        periodicity[0], periodicity[1], periodicity[2]); // periodicity
+   }
+   // save block forest to file (but do not overwrite existing file if snapshot is loaded)
+   if (storeSnapshot && !loadSnapshot)
+   {
+      blockForest->getBlockForest().saveToFile(snapshotBaseFolder + "/" + blockForestFile);
+   }
+   const auto vtkParameters      = configPtr->getOneBlock("VTK");
+   const auto vtkFluidParameters = vtkParameters.getOneBlock("fluid_field");
+   BlockDataID pdfFieldID;
+   const std::string pdfFieldFile("pdfField.file");
+   BlockDataID fillFieldID;
+   const std::string fillFieldFile("fillField.file");
+   BlockDataID excessMassFieldID;
+   const std::string excessMassFieldFile("excessMassField.file");
+   // force density field must be added here to create lattice model
+   BlockDataID forceDensityFieldID =
+      field::addToStorage< VectorFieldFlattened_T >(blockForest, "Force field", real_c(0), field::fzyx, uint_c(1));
+   if (excessMassModel.isEvenlyAllInterfaceFallbackLiquidType())
+   {
+      // add additional field for storing excess mass in liquid cells
+      excessMassFieldID =
+         field::addToStorage< ScalarField_T >(blockForest, "Excess mass", real_c(0), field::fzyx, uint_c(1));
+   }
+   LatticeModel_T latticeModel = LatticeModel_T(forceDensityFieldID, relaxationRate);
+   if (loadSnapshot)
+   {
+      // load PDF field from file
+      shared_ptr< lbm::internal::PdfFieldHandling< LatticeModel_T > > pdfFieldDataHandling =
+         make_shared< lbm::internal::PdfFieldHandling< LatticeModel_T > >(
+            blockForest, latticeModel, false, Vector3< real_t >(real_c(0)), real_c(1), uint_c(1), field::fzyx);
+      pdfFieldID = (blockForest->getBlockStorage())
+                      .loadBlockData(snapshotBaseFolder + "/" + pdfFieldFile, pdfFieldDataHandling, "PDF field");
+      // load fill level field from file
+      std::shared_ptr< ScalarFieldHandling< ScalarField_T > > fillFieldDataHandling =
+         std::make_shared< ScalarFieldHandling< ScalarField_T > >(blockForest, uint_c(2));
+      fillFieldID =
+         (blockForest->getBlockStorage())
+            .loadBlockData(snapshotBaseFolder + "/" + fillFieldFile, fillFieldDataHandling, "Fill level field");
+      // load fill level field from file
+      std::shared_ptr< ScalarFieldHandling< ScalarField_T > > excessMassFieldDataHandling =
+         std::make_shared< ScalarFieldHandling< ScalarField_T > >(blockForest, uint_c(1));
+      excessMassFieldID = (blockForest->getBlockStorage())
+                             .loadBlockData(snapshotBaseFolder + "/" + excessMassFieldFile, excessMassFieldDataHandling,
+                                            "Excess mass field");
+   }
+   else
+   {
+      // add PDF field
+      pdfFieldID =
+         lbm::addPdfFieldToStorage(blockForest, "PDF field", latticeModel,
+                                   Vector3< real_t >(targetMeanVelocityMagnitude, 0, 0), real_c(1.0), field::fzyx);
+      // add fill level field (initialized with 0, i.e., gas everywhere)
+      fillFieldID =
+         field::addToStorage< ScalarField_T >(blockForest, "Fill level field", real_c(0.0), field::fzyx, uint_c(2));
+      // add fill level field (initialized with 0, i.e., gas everywhere)
+      excessMassFieldID =
+         field::addToStorage< ScalarField_T >(blockForest, "Excess mass field", real_c(0.0), field::fzyx, uint_c(1));
+   }
+   // MesaPD data structures
+   auto particleStorage  = std::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto particleAccessor = std::make_shared< mesa_pd::data::ParticleAccessorWithBaseShape >(particleStorage);
+   auto mesapdDomain     = std::make_shared< mesa_pd::domain::BlockForestDomain >(blockForest->getBlockForestPointer());
+   BlockDataID particleStorageID;
+   const std::string particleStorageFile("particleStorageFile.file");
+   if (loadSnapshot)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Initializing particles from checkpointing file!");
+      particleStorageID = blockForest->loadBlockData(snapshotBaseFolder + "/" + particleStorageFile,
+                                                     mesa_pd::domain::createBlockForestDataHandling(particleStorage),
+                                                     "Particle Storage");
+      mesa_pd::mpi::ClearNextNeighborSync CNNS;
+      CNNS(*particleAccessor);
+      mesa_pd::mpi::ClearGhostOwnerSync CGOS;
+      CGOS(*particleAccessor);
+   }
+   else
+   {
+      particleStorageID =
+         blockForest->addBlockData(mesa_pd::domain::createBlockForestDataHandling(particleStorage), "Particle Storage");
+   }
+   BlockDataID particleFieldID = field::addToStorage< lbm_mesapd_coupling::ParticleField_T >(
+      blockForest, "Particle field", particleAccessor->getInvalidUid(), field::fzyx, uint_c(2));
+   auto densityReferenceHeight = absoluteLiquidHeight;
+   auto hydrostaticDensityFct  = [acceleration, densityReferenceHeight](const Vector3< real_t >& position) {
+      uint_t forceComponent = uint_c(2); // gravity is here strictly only acting in z direction!
+      return getHydrostaticDensity(position[forceComponent], densityReferenceHeight, (*acceleration)[forceComponent]);
+   };
+   // add boundary handling
+   const std::shared_ptr< AntidunesBoundaryHandling_T > antidunesBoundaryHandling =
+      std::make_shared< AntidunesBoundaryHandling_T >(blockForest, pdfFieldID, fillFieldID, particleFieldID,
+                                                      particleAccessor, hydrostaticDensityFct);
+   const BlockDataID flagFieldID                                    = antidunesBoundaryHandling->getFlagFieldID();
+   const typename AntidunesBoundaryHandling_T::FlagInfo_T& flagInfo = antidunesBoundaryHandling->getFlagInfo();
+   real_t sinusAmplitude  = real_c(0.5) * initialAmplitude;
+   real_t sinusOffset     = floorHeight;
+   real_t sinusWavelength = real_c(domainSize[0]) / real_c(wavePeriods);
+   if (!loadSnapshot)
+   {
+      // samples used in the Monte-Carlo like estimation of the fill level
+      const uint_t fillLevelInitSamples = uint_c(100); // actually there will be 101 since 0 is also included
+      const uint_t numTotalPoints = (fillLevelInitSamples + uint_c(1)) * (fillLevelInitSamples + uint_c(1));
+      const real_t stepsize       = real_c(1) / real_c(fillLevelInitSamples);
+      // initialize free-surface sine profile
+      for (auto blockIt = blockForest->begin(); blockIt != blockForest->end(); ++blockIt)
+      {
+         ScalarField_T* const fillField = blockIt->getData< ScalarField_T >(fillFieldID);
+         WALBERLA_FOR_ALL_CELLS(fillFieldIt, fillField, {
+            // cell in block-local coordinates
+            const Cell localCell = fillFieldIt.cell();
+            // get cell in global coordinates
+            Cell globalCell = localCell;
+            blockForest->transformBlockLocalToGlobalCell(globalCell, *blockIt, localCell);
+            // Monte-Carlo like estimation of the fill level:
+            // create uniformly-distributed sample points in each cell and count the number of points below the sine
+            // profile; this fraction of points is used as the fill level to initialize the profile
+            uint_t numPointsBelow = uint_c(0);
+            for (uint_t xSample = uint_c(0); xSample <= fillLevelInitSamples; ++xSample)
+            {
+               // Pascal et al. (2021) defined the amplitude to span from minimum peak to maximum peak; in
+               // initializationProfile(), the amplitude is defined to range from the average to the maximum peak
+               const real_t functionValue =
+                  initializationProfile(real_c(globalCell[0]) + real_c(xSample) * stepsize, sinusAmplitude,
+                                        absoluteLiquidHeight + real_c(0.5), sinusWavelength);
+               for (uint_t zSample = uint_c(0); zSample <= fillLevelInitSamples; ++zSample)
+               {
+                  const real_t zPoint = real_c(globalCell[2]) + real_c(zSample) * stepsize;
+                  // with operator <, a fill level of 1 can not be reached when the line is equal to the cell's top
+                  // border; with operator <=, a fill level of 0 can not be reached when the line is equal to the cell's
+                  // bottom border
+                  if (zPoint < functionValue) { ++numPointsBelow; }
+               }
+            }
+            // fill level is fraction of points below sine profile
+            fillField->get(localCell) = real_c(numPointsBelow) / real_c(numTotalPoints);
+         }) // WALBERLA_FOR_ALL_CELLS
+      }
+      initializePoiseuilleProfile(*blockForest, pdfFieldID, fillFieldID, floorHeight, liquidHeight + real_c(0.5),
+                                  (*acceleration)[0], viscosity, sinusAmplitude, sinusWavelength);
+   }
+   // initialize domain boundary conditions from config file
+   const auto boundaryParameters = configPtr->getOneBlock("BoundaryParameters");
+   antidunesBoundaryHandling->initFromConfig(boundaryParameters);
+   std::function< void(void) > syncCall;
+   auto simulationDomainAABB = blockForest->getDomain();
+   lbm_mesapd_coupling::ParticleMappingKernel< AntidunesBoundaryHandling_T::BoundaryHandling_T > particleMappingKernel(
+      blockForest, antidunesBoundaryHandling->getHandlingID());
+   lbm_mesapd_coupling::MovingParticleMappingKernel< AntidunesBoundaryHandling_T::BoundaryHandling_T >
+      movingParticleMappingKernel(blockForest, antidunesBoundaryHandling->getHandlingID(), particleFieldID);
+   uint_t numParticles = uint_c(0);
+   // initialize bottom solid sine profile
+   if (!loadSnapshot)
+   {
+      auto createParticleFct = [sinusAmplitude, sinusOffset, sinusWavelength](Vector3< real_t > pos) {
+         return pos[2] < initializationProfile(pos[0], sinusAmplitude, sinusOffset, sinusWavelength);
+      };
+      real_t maxParticleHeight = real_c(0);
+      initSpheresFromFile(particleInFileName, *particleStorage, *mesapdDomain, particleDensityRatio, domainSize,
+                          createParticleFct, simulationDomainAABB, bedCopiesInX, bedCopiesInY, numParticles,
+                          maxParticleHeight, particleScalingFactor);
+      WALBERLA_LOG_INFO_ON_ROOT("Max particle height " << maxParticleHeight);
+      if ((sinusOffset + sinusAmplitude) > maxParticleHeight)
+         WALBERLA_ABORT("Created particle bed is below desired sinus shape!");
+      if (real_c(2) * sinusAmplitude > (maxParticleHeight - particleFixingHeight))
+         WALBERLA_ABORT("Created mobile particle bed is not high enough for desired sinus shape!");
+      if (useNoSlipParticles && (particleFixingHeight < maxParticleHeight))
+         WALBERLA_ABORT("You are using no-slip BCs on particles (which does not set hydrodynamic forces) but do not "
+                        "fix all particles - this leads to wrong behavior and is not permitted!")
+      // fix lower particles
+      particleStorage->forEachParticle(
+         useOpenMP, mesa_pd::kernel::SelectAll(), *particleAccessor,
+         [particleFixingHeight](const size_t idx, auto& ac) {
+            if (ac.getPosition(idx)[2] < particleFixingHeight)
+               mesa_pd::data::particle_flags::set(ac.getFlagsRef(idx), mesa_pd::data::particle_flags::FIXED);
+         },
+         *particleAccessor);
+   }
+   else
+   {
+      real_t avgParticleDiameterTest = real_c(0);
+      particleStorage->forEachParticle(
+         false, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+         [&numParticles, &avgParticleDiameterTest](const size_t idx, auto& ac) {
+            auto sp = static_cast< mesa_pd::data::Sphere* >(ac.getBaseShape(idx).get());
+            ++numParticles;
+            avgParticleDiameterTest += real_c(2) * sp->getRadius();
+         },
+         *particleAccessor);
+      mpi::allReduceInplace(numParticles, mpi::SUM);
+      mpi::allReduceInplace(avgParticleDiameterTest, mpi::SUM);
+      avgParticleDiameterTest /= real_c(numParticles);
+      WALBERLA_LOG_INFO_ON_ROOT("Read particles from check pointing file with avg diameter of "
+                                << avgParticleDiameterTest)
+      if (std::abs(avgParticleDiameterTest - avgParticleDiameter) / avgParticleDiameterTest > real_c(0.05))
+      {
+         WALBERLA_ABORT("Particle diameters not correct.")
+      }
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("Created " << numParticles << " particles");
+   // create planes
+   createPlane(*particleStorage, simulationDomainAABB.minCorner(), Vector3< real_t >(real_c(0), real_c(0), real_c(1)));
+   createPlane(*particleStorage, simulationDomainAABB.maxCorner(), Vector3< real_t >(real_c(0), real_c(0), real_c(-1)));
+   const real_t blockSyncExtension    = real_c(2.5);
+   real_t maxPossibleParticleDiameter = avgParticleDiameter * real_c(1.1);
+   if (maxPossibleParticleDiameter < real_c(2) * real_c(cellsPerBlock.min()) - blockSyncExtension)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Using next neighbor sync for particles");
+      syncCall = [particleStorage, mesapdDomain, blockSyncExtension]() {
+         mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+         syncNextNeighborFunc(*particleStorage, *mesapdDomain, blockSyncExtension);
+      };
+      syncCall();
+   }
+   else
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("Using ghost owner sync for particles")
+      syncCall = [particleStorage, mesapdDomain, blockSyncExtension]() {
+         mesa_pd::mpi::SyncGhostOwners syncGhostOwnersFunc;
+         syncGhostOwnersFunc(*particleStorage, *mesapdDomain, blockSyncExtension);
+      };
+      for (uint_t i = uint_c(0); i < uint_c(std::ceil(maxPossibleParticleDiameter / real_c(cellsPerBlock.min()))); ++i)
+         syncCall();
+   }
+   if (useNoSlipParticles)
+   {
+      particleStorage->forEachParticle(useOpenMP, SphereSelector(), *particleAccessor, particleMappingKernel,
+                                       *particleAccessor, AntidunesBoundaryHandling_T::noSlipFlagID);
+   }
+   else
+   {
+      particleStorage->forEachParticle(useOpenMP, SphereSelector(), *particleAccessor, movingParticleMappingKernel,
+                                       *particleAccessor, AntidunesBoundaryHandling_T::movingObstacleFlagID);
+   }
+   // IMPORTANT REMARK: this must be only called after every solid flag has been set; otherwise, the boundary handling
+   // might not detect solid flags correctly
+   antidunesBoundaryHandling->initFlagsFromFillLevel();
+   // initialize hydrostatic pressure
+   if (!loadSnapshot) { initHydrostaticPressure< PdfField_T >(blockForest, pdfFieldID, hydrostaticDensityFct); }
+   // initialize force density field
+   walberla::free_surface::initForceDensityFieldCodegen< PdfField_T, FlagField_T, VectorFieldFlattened_T,
+                                                         ScalarField_T >(
+      blockForest, forceDensityFieldID, fillFieldID, pdfFieldID, flagFieldID, flagInfo, *acceleration);
+   // communication after initialization
+   Communication_T communication(blockForest, flagFieldID, fillFieldID, forceDensityFieldID);
+   communication();
+   PdfCommunication_T pdfCommunication(blockForest, pdfFieldID);
+   pdfCommunication();
+   // add bubble model
+   std::shared_ptr< walberla::free_surface::bubble_model::BubbleModelBase > bubbleModel =
+      std::make_shared< walberla::free_surface::bubble_model::BubbleModelConstantPressure >(real_c(1));
+   // set density in non-liquid or non-interface cells to 1 (after initializing with hydrostatic pressure)
+   // setDensityInNonFluidCellsToOne< FlagField_T, PdfField_T >(blockForest, flagInfo, flagFieldID, pdfFieldID);
+   // create timeloop
+   SweepTimeloop timeloop(blockForest, timesteps);
+   timeloop.setCurrentTimeStep(beginTimeStep);
+   timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+   // Laplace pressure = 2 * surface tension * curvature; curvature computation is not necessary with no surface
+   // tension
+   bool computeCurvature = false;
+   if (!realIsEqual(surfaceTension, real_c(0), real_c(1e-14))) { computeCurvature = true; }
+   auto blockStateUpdate = StateSweep(blockForest, flagInfo, flagFieldID);
+   // add surface geometry handler
+   BlockDataID curvatureFieldID =
+      field::addToStorage< ScalarField_T >(blockForest, "Curvature field", real_c(0), field::fzyx, uint_c(1));
+   BlockDataID normalFieldID = field::addToStorage< VectorField_T >(
+      blockForest, "Normal field", Vector3< real_t >(real_c(0)), field::fzyx, uint_c(1));
+   BlockDataID obstacleNormalFieldID = field::addToStorage< VectorField_T >(
+      blockForest, "Obstacle normal field", Vector3< real_t >(real_c(0)), field::fzyx, uint_c(1));
+   // add field for smoothed fill levels
+   BlockDataID smoothFillFieldID =
+      field::addToStorage< ScalarField_T >(blockForest, "Smooth fill level field", real_c(0), field::fzyx, uint_c(1));
+   // smooth fill level field for decreasing error in finite difference normal and curvature computation (see
+   // dissertation of S. Bogner, 2017 (section 4.4.2.1))
+   walberla::free_surface::SmoothingSweep< CommunicationStencil_T, FlagField_T, ScalarField_T, VectorField_T >
+      smoothingSweep(smoothFillFieldID, fillFieldID, flagFieldID,
+                     walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(),
+                     enableWetting);
+   // IMPORTANT REMARK: SmoothingSweep must be executed on all blocks, because the algorithm works on all liquid,
+   // interface and gas cells. This is necessary since the normals are not only computed in interface cells, but also
+   // in the neighborhood of interface cells. Therefore, meaningful values for the fill levels of the second
+   // neighbors of interface cells are also required in NormalSweep.
+   timeloop.add() << Sweep(smoothingSweep, "Sweep: fill level smoothing")
+                  << AfterFunction(Communication_T(blockForest, smoothFillFieldID),
+                                   "Communication: after smoothing sweep");
+   // compute interface normals (using smoothed fill level field)
+   walberla::free_surface::NormalSweep< CommunicationStencil_T, FlagField_T, ScalarField_T, VectorField_T > normalSweep(
+      normalFieldID, smoothFillFieldID, flagFieldID, walberla::free_surface::flagIDs::interfaceFlagID,
+      walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(), true, false, true,
+      false);
+   timeloop.add() << Sweep(normalSweep, "Sweep: normal computation", StateSweep::fullFreeSurface)
+                  << Sweep(emptySweep(), "Empty sweep: normal")
+                  << AfterFunction(Communication_T(blockForest, normalFieldID), "Communication: after normal sweep");
+   if (computeCurvature)
+   {
+      // compute interface curvature using finite differences according to Brackbill et al.
+      walberla::free_surface::CurvatureSweepFiniteDifferences< CommunicationStencil_T, FlagField_T, ScalarField_T,
+                                                               VectorField_T >
+         curvSweep(curvatureFieldID, normalFieldID, obstacleNormalFieldID, flagFieldID,
+                   walberla::free_surface::flagIDs::interfaceFlagID,
+                   walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, flagInfo.getObstacleIDSet(), false,
+                   real_c(0));
+      timeloop.add() << Sweep(curvSweep, "Sweep: curvature computation (finite difference method)",
+                              StateSweep::fullFreeSurface)
+                     << Sweep(emptySweep(), "Empty sweep: curvature")
+                     << AfterFunction(Communication_T(blockForest, curvatureFieldID),
+                                      "Communication: after curvature sweep");
+   }
+   // add surface dynamics handler
+   // add standard waLBerla boundary handling
+   timeloop.add() << Sweep(antidunesBoundaryHandling->getBoundarySweep(), "Sweep: boundary handling",
+                           Set< SUID >::emptySet(), StateSweep::onlyGasAndBoundary)
+                  << Sweep(emptySweep(), "Empty sweep: boundary handling", StateSweep::onlyGasAndBoundary);
+   // add sweep for weighting force in interface cells with fill level and density
+   // different version for codegen because pystencils does not support 'Ghostlayerfield<Vector3(), 1>'
+   const ForceDensityCodegenSweep< LatticeModel_T, FlagField_T, VectorFieldFlattened_T, ScalarField_T >
+      forceDensityCodegenSweep(forceDensityFieldID, pdfFieldID, flagFieldID, fillFieldID, flagInfo, acceleration);
+   timeloop.add() << Sweep(forceDensityCodegenSweep, "Sweep: force weighting", Set< SUID >::emptySet(),
+                           StateSweep::onlyGasAndBoundary)
+                  << Sweep(emptySweep(), "Empty sweep: force weighting", StateSweep::onlyGasAndBoundary)
+                  << AfterFunction(Communication_T(blockForest, forceDensityFieldID),
+                                   "Communication: after force weighting sweep");
+   // sweep for
+   // - reconstruction of PDFs in interface cells
+   // - streaming of PDFs in interface cells (and liquid cells on the same block)
+   // - advection of mass
+   // - update bubble volumes
+   // - marking interface cells for conversion
+   const walberla::free_surface::StreamReconstructAdvectSweep<
+      LatticeModel_T, typename AntidunesBoundaryHandling_T::BoundaryHandling_T, FlagField_T,
+      typename AntidunesBoundaryHandling_T::FlagInfo_T, ScalarField_T, VectorField_T, true >
+      streamReconstructAdvectSweep(surfaceTension, antidunesBoundaryHandling->getHandlingID(), fillFieldID, flagFieldID,
+                                   pdfFieldID, normalFieldID, curvatureFieldID, flagInfo, bubbleModel.get(),
+                                   pdfReconstructionModel, useSimpleMassExchange, cellConversionThreshold,
+                                   cellConversionForceThreshold);
+   // sweep acts only on blocks with at least one interface cell (due to StateSweep::fullFreeSurface)
+   timeloop.add() << Sweep(streamReconstructAdvectSweep, "Sweep: StreamReconstructAdvect", StateSweep::fullFreeSurface)
+                  << Sweep(emptySweep(), "Empty sweep: StreamReconstructAdvect")
+                  // do not communicate PDFs here:
+                  // - stream on blocks with "StateSweep::fullFreeSurface" was performed here using post-collision PDFs
+                  // - stream on other blocks is performed below and should also use post-collision PDFs
+                  // => if PDFs were communicated here, the ghost layer of other blocks would have post-stream PDFs
+                  << AfterFunction(Communication_T(blockForest, fillFieldID, flagFieldID),
+                                   "Communication: after StreamReconstructAdvect sweep")
+                  << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID),
+                                   "Second ghost layer update: after StreamReconstructAdvect sweep (fill level field)")
+                  << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID),
+                                   "Second ghost layer update: after StreamReconstructAdvect sweep (flag field)");
+   auto lbmSweepGenerated = typename LatticeModel_T::Sweep(pdfFieldID);
+   // temporary class for being able to call the LBM collision with operator()
+   class CollideSweep
+   {
+    public:
+      CollideSweep(const typename LatticeModel_T::Sweep& sweep) : sweep_(sweep){};
+      void operator()(IBlock* const block, const uint_t numberOfGhostLayersToInclude = uint_t(0))
+      {
+         sweep_.collide(block, numberOfGhostLayersToInclude);
+      }
+    private:
+      typename LatticeModel_T::Sweep sweep_;
+   };
+   timeloop.add() << Sweep(CollideSweep(lbmSweepGenerated), "Sweep: collision (generated)", StateSweep::fullFreeSurface)
+                  << Sweep(lbmSweepGenerated, "Sweep: streamCollide (generated)", StateSweep::onlyLBM)
+                  << Sweep(emptySweep(), "Empty sweep: streamCollide (generated)")
+                  << AfterFunction(PdfCommunication_T(blockForest, pdfFieldID),
+                                   "Communication: after streamCollide (generated)");
+   // convert cells
+   // - according to the flags from StreamReconstructAdvectSweep (interface -> gas/liquid)
+   // - to ensure a closed layer of interface cells (gas/liquid -> interface)
+   // - detect and register bubble merges/splits (bubble volumes are already updated in StreamReconstructAdvectSweep)
+   // - convert cells and initialize PDFs near inflow boundaries
+   const walberla::free_surface::CellConversionSweep< LatticeModel_T, AntidunesBoundaryHandling_T::BoundaryHandling_T,
+                                                      ScalarField_T >
+      cellConvSweep(antidunesBoundaryHandling->getHandlingID(), pdfFieldID, flagInfo, bubbleModel.get());
+   timeloop.add() << Sweep(cellConvSweep, "Sweep: cell conversion", StateSweep::fullFreeSurface)
+                  << Sweep(emptySweep(), "Empty sweep: cell conversion")
+                  //<< AfterFunction(PdfCommunication_T(blockForest, pdfFieldID),
+                  //
+                  //                 "Communication: after cell conversion sweep (PDF field)")
+                  // communicate the flag field also in corner directions
+                  << AfterFunction(Communication_T(blockForest, flagFieldID),
+                                   "Communication: after cell conversion sweep (flag field)")
+                  << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID),
+                                   "Second ghost layer update: after cell conversion sweep (flag field)");
+   // reinitialize PDFs, i.e., refill cells that were converted from gas to interface
+   // - when the flag "convertedFromGasToInterface" has been set (by CellConversionSweep)
+   // - according to the method specified with pdfRefillingModel_
+   const walberla::free_surface::EquilibriumRefillingSweep< LatticeModel_T, FlagField_T > equilibriumRefillingSweep(
+      pdfFieldID, flagFieldID, flagInfo, true);
+   timeloop.add() << Sweep(equilibriumRefillingSweep, "Sweep: EquilibriumRefilling", StateSweep::fullFreeSurface)
+                  << Sweep(emptySweep(), "Empty sweep: EquilibriumRefilling")
+                  << AfterFunction(PdfCommunication_T(blockForest, pdfFieldID),
+                                   "Communication: after EquilibriumRefilling sweep");
+   // distribute excess mass:
+   // - excess mass: mass that is free after conversion from interface to gas/liquid cells
+   // - update the bubble model
+   // IMPORTANT REMARK: this sweep computes the mass via the density, i.e., the PDF field must be up-to-date and the
+   // PdfRefillingSweep must have been performed
+   if (excessMassModel.isEvenlyType())
+   {
+      const walberla::free_surface::ExcessMassDistributionSweepInterfaceEvenly< LatticeModel_T, FlagField_T,
+                                                                                ScalarField_T, VectorField_T >
+         distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo);
+      timeloop.add()
+         << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface)
+         << Sweep(emptySweep(), "Empty sweep: distribute excess mass")
+         << AfterFunction(Communication_T(blockForest, fillFieldID),
+                          "Communication: after excess mass distribution sweep")
+         << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID),
+                          "Second ghost layer update: after excess mass distribution sweep (fill level field)")
+         // update bubble model, i.e., perform registered bubble merges/splits; bubble merges/splits are
+         // already detected and registered by CellConversionSweep
+         << AfterFunction([&bubbleModel]() { bubbleModel->update(); }, "Sweep: bubble model update");
+   }
+   else
+   {
+      if (excessMassModel.isWeightedType())
+      {
+         const walberla::free_surface::ExcessMassDistributionSweepInterfaceWeighted< LatticeModel_T, FlagField_T,
+                                                                                     ScalarField_T, VectorField_T >
+            distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo, normalFieldID);
+         timeloop.add()
+            << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface)
+            << Sweep(emptySweep(), "Empty sweep: distribute excess mass")
+            << AfterFunction(Communication_T(blockForest, fillFieldID),
+                             "Communication: after excess mass distribution sweep")
+            << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID),
+                             "Second ghost layer update: after excess mass distribution sweep (fill level field)")
+            // update bubble model, i.e., perform registered bubble merges/splits; bubble merges/splits
+            // are already detected and registered by CellConversionSweep
+            << AfterFunction([&]() { bubbleModel->update(); }, "Sweep: bubble model update");
+      }
+      else
+      {
+         if (excessMassModel.isEvenlyAllInterfaceFallbackLiquidType())
+         {
+            const walberla::free_surface::ExcessMassDistributionSweepInterfaceAndLiquid< LatticeModel_T, FlagField_T,
+                                                                                         ScalarField_T, VectorField_T >
+               distributeMassSweep(excessMassModel, fillFieldID, flagFieldID, pdfFieldID, flagInfo, excessMassFieldID);
+            timeloop.add()
+               // perform this sweep also on "onlyLBM" blocks because liquid cells also exchange excess mass here
+               << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::fullFreeSurface)
+               << Sweep(distributeMassSweep, "Sweep: excess mass distribution", StateSweep::onlyLBM)
+               << Sweep(emptySweep(), "Empty sweep: distribute excess mass")
+               << AfterFunction(Communication_T(blockForest, fillFieldID, excessMassFieldID),
+                                "Communication: after excess mass distribution sweep")
+               << AfterFunction(blockforest::UpdateSecondGhostLayer< ScalarField_T >(blockForest, fillFieldID),
+                                "Second ghost layer update: after excess mass distribution sweep (fill level field)")
+               // update bubble model, i.e., perform registered bubble merges/splits; bubble
+               // merges/splits are already detected and registered by CellConversionSweep
+               << AfterFunction([&]() { bubbleModel->update(); }, "Sweep: bubble model update");
+         }
+      }
+   }
+   // reset all flags that signal cell conversions (except "keepInterfaceForWettingFlag")
+   walberla::free_surface::ConversionFlagsResetSweep< FlagField_T > resetConversionFlagsSweep(flagFieldID, flagInfo);
+   timeloop.add() << Sweep(resetConversionFlagsSweep, "Sweep: conversion flag reset", StateSweep::fullFreeSurface)
+                  << Sweep(emptySweep(), "Empty sweep: conversion flag reset")
+                  << AfterFunction(Communication_T(blockForest, flagFieldID),
+                                   "Communication: after excess mass distribution sweep")
+                  << AfterFunction(blockforest::UpdateSecondGhostLayer< FlagField_T >(blockForest, flagFieldID),
+                                   "Second ghost layer update: after excess mass distribution sweep (flag field)");
+   // update block states
+   timeloop.add() << Sweep(blockStateUpdate, "Sweep: block state update");
+   // add VTK output
+   walberla::free_surface::addVTKOutput< LatticeModel_T, AntidunesBoundaryHandling_T, PdfField_T, FlagField_T,
+                                         ScalarField_T, VectorField_T >(
+      blockForest, timeloop, configPtr, flagInfo, pdfFieldID, flagFieldID, fillFieldID, BlockDataID(), curvatureFieldID,
+      normalFieldID, obstacleNormalFieldID);
+   // add triangle mesh output of free surface
+   walberla::free_surface::SurfaceMeshWriter< ScalarField_T, FlagField_T > surfaceMeshWriter(
+      blockForest, fillFieldID, flagFieldID, walberla::free_surface::flagIDs::liquidInterfaceGasFlagIDs, real_c(0),
+      configPtr);
+   surfaceMeshWriter(); // write initial mesh
+   timeloop.addFuncAfterTimeStep(surfaceMeshWriter, "Writer: surface mesh");
+   if (vtkSpacingParticles != uint_t(0))
+   {
+      // particle field
+      auto particleFieldVTK =
+         vtk::createVTKOutput_BlockData(blockForest, "particle_field", vtkSpacingParticles, 0, false, vtkFolder);
+      auto cellBB_filterParameters             = vtkFluidParameters.getOneBlock("CellBB_filter");
+      const Vector3< uint_t > cellBB_filterMin = cellBB_filterParameters.getParameter< Vector3< uint_t > >("min");
+      const Vector3< uint_t > cellBB_filterMax = cellBB_filterParameters.getParameter< Vector3< uint_t > >("max");
+      AABB sliceAABB(real_c(cellBB_filterMin[0]), real_c(cellBB_filterMin[1]), real_c(cellBB_filterMin[2]),
+                     real_c(cellBB_filterMax[0] + uint_t(1)), real_c(cellBB_filterMax[1] + uint_t(1)),
+                     real_c(cellBB_filterMax[2] + uint_t(1)));
+      particleFieldVTK->addCellInclusionFilter(vtk::AABBCellFilter(sliceAABB));
+      particleFieldVTK->addCellDataWriter(
+         make_shared< field::VTKWriter< GhostLayerField< walberla::id_t, 1 > > >(particleFieldID, "particleField"));
+      particleFieldVTK->setSamplingResolution(vtkFluidParameters.getParameter< real_t >("samplingResolution"));
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleFieldVTK), "VTK (particle field data");
+   }
+   if (vtkSpacingParticles != uint_t(0))
+   {
+      // sphere
+      auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(particleStorage);
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
+      // limit output to process-local spheres
+      particleVtkOutput->setParticleSelector([](const mesa_pd::data::ParticleStorage::iterator& pIt) {
+         using namespace walberla::mesa_pd::data::particle_flags;
+         return (pIt->getBaseShape()->getShapeType() == mesa_pd::data::Sphere::SHAPE_TYPE) &&
+                !isSet(pIt->getFlags(), GHOST);
+      });
+      auto particleVtkWriter =
+         vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacingParticles, vtkFolder,
+                                        std::string("simulation_step"), false, true, true, true, beginTimeStep);
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+   }
+   // add logging for computational performance
+   const lbm::PerformanceLogger< FlagField_T > performanceLogger(
+      blockForest, flagFieldID, walberla::free_surface::flagIDs::liquidInterfaceFlagIDs, performanceLogFrequency);
+   timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluator: performance logging");
+   // LBM stability check
+   timeloop.addFuncAfterTimeStep(makeSharedFunctor(field::makeStabilityChecker< PdfField_T, FlagField_T >(
+                                    walberlaEnv.config(), blockForest, pdfFieldID, flagFieldID,
+                                    walberla::free_surface::flagIDs::liquidInterfaceFlagIDs)),
+                                 "LBM stability check");
+   // add sweep for evaluating the fluid's mean velocity
+   const std::shared_ptr< Vector3< real_t > > meanVelocity = std::make_shared< Vector3< real_t > >(real_c(0));
+   const real_t velocityAveragingFactor = real_c(1) / (liquidHeight * real_c(domainSize[0]) * real_c(domainSize[1]));
+   MeanVelocityComputer< AntidunesBoundaryHandling_T, PdfField_T, FlagField_T > meanVelocityComputer(
+      blockForest, antidunesBoundaryHandling, pdfFieldID, meanVelocity, velocityAveragingFactor);
+   // PID Controller
+   shared_ptr< ForcingAdjuster > forcingAdjuster =
+      make_shared< ForcingAdjuster >(blockForest, targetMeanVelocityMagnitude, (*acceleration)[0], proportionalGain,
+                                     derivativeGain, integralGain, maxRamp, minActuatingVariable, maxActuatingVariable);
+   if (loadSnapshot) { forcingAdjuster->loadPIDSnapshot(snapshotBaseFolder + "/" + "pidState.file"); }
+   WcTimingPool timingPool;
+   // this is carried out after the particle integration, it corrects the flag field and restores missing PDF
+   // information then, the checkpointing file can be written, as otherwise some cells are invalid and can not be
+   // recovered
+   SweepTimeloop timeloopAfterParticles(blockForest, timesteps);
+   timeloopAfterParticles.setCurrentTimeStep(beginTimeStep);
+   // sweep for updating the particle mapping into the LBM simulation
+   bool strictlyConserveMomentum = false;
+   timeloopAfterParticles.add() << Sweep(
+      lbm_mesapd_coupling::makeMovingParticleMapping< PdfField_T, AntidunesBoundaryHandling_T::BoundaryHandling_T >(
+         blockForest, pdfFieldID, antidunesBoundaryHandling->getHandlingID(), particleFieldID, particleAccessor,
+         AntidunesBoundaryHandling_T::movingObstacleFlagID, FormerMO_Flag,
+         lbm_mesapd_coupling::RegularParticlesSelector(), strictlyConserveMomentum),
+      "Particle Mapping");
+   // sweep for restoring PDFs in cells previously occupied by particles
+   bool reconstruction_recomputeTargetDensity = false;
+   bool reconstruction_useCentralDifferences  = true;
+   auto gradReconstructor =
+      lbm_mesapd_coupling::makeGradsMomentApproximationReconstructor< AntidunesBoundaryHandling_T::BoundaryHandling_T >(
+         blockForest, antidunesBoundaryHandling->getHandlingID(), relaxationRate, reconstruction_recomputeTargetDensity,
+         reconstruction_useCentralDifferences);
+   timeloopAfterParticles.add()
+      << Sweep(makeSharedSweep(
+                  lbm_mesapd_coupling::makePdfReconstructionManager< PdfField_T,
+                                                                     AntidunesBoundaryHandling_T::BoundaryHandling_T >(
+                     blockForest, pdfFieldID, antidunesBoundaryHandling->getHandlingID(), particleFieldID,
+                     particleAccessor, FormerMO_Flag, walberla::free_surface::flagIDs::liquidFlagID, gradReconstructor,
+                     strictlyConserveMomentum)),
+               "PDF Restore")
+      << AfterFunction(Communication_T(blockForest, flagFieldID, particleFieldID),
+                       "Communication: after PDF reconstruction sweep") // unsure if necessary but added for consistency
+      << AfterFunction(pdfCommunication, "PDF Communication");
+   real_t timeStepSizeParticles = real_c(1) / real_c(particleNumSubCycles);
+   mesa_pd::kernel::VelocityVerletPreForceUpdate vvIntegratorPreForce(timeStepSizeParticles);
+   mesa_pd::kernel::VelocityVerletPostForceUpdate vvIntegratorPostForce(timeStepSizeParticles);
+   mesa_pd::kernel::LinearSpringDashpot collisionResponse(1);
+   collisionResponse.setFrictionCoefficientDynamic(size_t(0), size_t(0), particleFrictionCoefficient);
+   mesa_pd::mpi::ReduceProperty reduceProperty;
+   mesa_pd::mpi::ReduceContactHistory reduceAndSwapContactHistory;
+   lbm_mesapd_coupling::ResetHydrodynamicForceTorqueKernel resetHydrodynamicForceTorque;
+   lbm_mesapd_coupling::AverageHydrodynamicForceTorqueKernel averageHydrodynamicForceTorque;
+   real_t particleCollisionTime = particleCollisionTimeNonDim * avgParticleDiameter;
+   lbm_mesapd_coupling::LubricationCorrectionKernel lubricationCorrectionKernel(
+      viscosity, [](real_t r) { return (real_c(0.001 + real_c(0.00007) * r)) * r; });
+   WALBERLA_LOG_INFO_ON_ROOT("Will use particle time step size of "
+                             << timeStepSizeParticles << " and collision time of " << particleCollisionTime);
+   AverageDataSliceEvaluator< PdfField_T, AntidunesBoundaryHandling_T, FlagField_T, ScalarField_T >
+      averageDataSliceEvaluator(blockForest, flagFieldID, fillFieldID, pdfFieldID);
+   std::shared_ptr< real_t > totalFluidMass = std::make_shared< real_t >(real_c(0));
+   walberla::free_surface::TotalMassComputer< AntidunesBoundaryHandling_T, PdfField_T, FlagField_T, ScalarField_T >
+      totalFluidMassEvaluator(blockForest, antidunesBoundaryHandling, pdfFieldID, fillFieldID, evaluationFrequency,
+                              totalFluidMass);
+   BedloadTransportEvaluator< ParticleAccessor_T > bedloadTransportEvaluator(
+      particleAccessor, real_c(1) / real_c(domainSize[0] * domainSize[1]), numParticles);
+   auto bedLoadTransportFileName = baseFolderName + "/bedload.txt";
+   WALBERLA_LOG_INFO_ON_ROOT("Writing bedload info to file " << bedLoadTransportFileName);
+   auto fluidInfoFileName = baseFolderName + "/fluidInfo.txt";
+   WALBERLA_LOG_INFO_ON_ROOT("Writing fluid info to file " << fluidInfoFileName);
+   // write info file
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream evalInfoFile(baseFolderName + "/info.txt");
+      evalInfoFile << evaluationFrequency << "\n";
+      evalInfoFile << gravity << "\n";
+      evalInfoFile << viscosity << "\n";
+      evalInfoFile << particleDensityRatio << "\n";
+      evalInfoFile << avgParticleDiameter << "\n";
+      evalInfoFile << domainSize[0] << "\n";
+      evalInfoFile << domainSize[1] << "\n";
+      evalInfoFile << domainSize[2] << "\n";
+      evalInfoFile << numParticles << "\n";
+      evalInfoFile << dx_SI << "\n";
+      evalInfoFile << dt_SI << "\n";
+      evalInfoFile.close();
+   }
+   Vector3< real_t > totalHydrodynamicForceOnParticles(real_c(0)); // only root will have valid values
+   for (uint_t t = beginTimeStep; t != timesteps; ++t)
+   {
+      timeloop.singleStep(timingPool, true);
+      timingPool["Mesa_pd"].start();
+      reduceProperty.operator()< mesa_pd::HydrodynamicForceTorqueNotification >(*particleStorage);
+      if (t == 0)
+      {
+         lbm_mesapd_coupling::InitializeHydrodynamicForceTorqueForAveragingKernel
+            initializeHydrodynamicForceTorqueForAveragingKernel;
+         particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+                                          initializeHydrodynamicForceTorqueForAveragingKernel, *particleAccessor);
+      }
+      particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+                                       averageHydrodynamicForceTorque, *particleAccessor);
+      for (auto subCycle = uint_t(0); subCycle < particleNumSubCycles; ++subCycle)
+      {
+         particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+                                          vvIntegratorPreForce, *particleAccessor);
+         syncCall();
+         if (useLubricationCorrection)
+         {
+            // lubrication correction
+            particleStorage->forEachParticlePairHalf(
+               useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *particleAccessor,
+               [&lubricationCorrectionKernel, &mesapdDomain](const size_t idx1, const size_t idx2, auto& ac) {
+                  mesa_pd::collision_detection::AnalyticContactDetection acd;
+                  acd.getContactThreshold() = lubricationCorrectionKernel.getNormalCutOffDistance();
+                  mesa_pd::kernel::DoubleCast double_cast;
+                  mesa_pd::mpi::ContactFilter contact_filter;
+                  if (double_cast(idx1, idx2, ac, acd, ac))
+                  {
+                     if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *mesapdDomain))
+                     {
+                        double_cast(acd.getIdx1(), acd.getIdx2(), ac, lubricationCorrectionKernel, ac,
+                                    acd.getContactNormal(), acd.getPenetrationDepth());
+                     }
+                  }
+               },
+               *particleAccessor);
+         }
+         // collision response
+         particleStorage->forEachParticlePairHalf(
+            useOpenMP, mesa_pd::kernel::ExcludeInfiniteInfinite(), *particleAccessor,
+            [&collisionResponse, &mesapdDomain, timeStepSizeParticles, particleRestitutionCoefficient,
+             particleCollisionTime, particleKappa](const size_t idx1, const size_t idx2, auto& ac) {
+               mesa_pd::collision_detection::AnalyticContactDetection acd;
+               mesa_pd::kernel::DoubleCast double_cast;
+               mesa_pd::mpi::ContactFilter contact_filter;
+               if (double_cast(idx1, idx2, ac, acd, ac))
+               {
+                  if (contact_filter(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), *mesapdDomain))
+                  {
+                     auto meff = real_c(1) / (ac.getInvMass(idx1) + ac.getInvMass(idx2));
+                     collisionResponse.setStiffnessAndDamping(0, 0, particleRestitutionCoefficient,
+                                                              particleCollisionTime, particleKappa, meff);
+                     collisionResponse(acd.getIdx1(), acd.getIdx2(), ac, acd.getContactPoint(), acd.getContactNormal(),
+                                       acd.getPenetrationDepth(), timeStepSizeParticles);
+                  }
+               }
+            },
+            *particleAccessor);
+         reduceAndSwapContactHistory(*particleStorage);
+         // add hydrodynamic force
+         lbm_mesapd_coupling::AddHydrodynamicInteractionKernel addHydrodynamicInteraction;
+         particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+                                          addHydrodynamicInteraction, *particleAccessor);
+         // add external forces
+         particleStorage->forEachParticle(
+            useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+            [particleDensityRatio, acceleration](const size_t idx, auto& ac) {
+               mesa_pd::addForceAtomic(idx, ac,
+                                       ac.getVolume(idx) *
+                                          Vector3< real_t >((*acceleration)[0], (*acceleration)[1],
+                                                            (particleDensityRatio - real_c(1)) * (*acceleration)[2]));
+            },
+            *particleAccessor);
+         reduceProperty.operator()< mesa_pd::ForceTorqueNotification >(*particleStorage);
+         particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectLocal(), *particleAccessor,
+                                          vvIntegratorPostForce, *particleAccessor);
+         syncCall();
+      }
+      // has to be evaluated here before the force info is erased from particles
+      if (t % evaluationFrequency == uint_c(0))
+         totalHydrodynamicForceOnParticles = getTotalHydrodynamicForceOnParticles(particleAccessor);
+      particleStorage->forEachParticle(useOpenMP, mesa_pd::kernel::SelectAll(), *particleAccessor,
+                                       resetHydrodynamicForceTorque, *particleAccessor);
+      timingPool["Mesa_pd"].end();
+      // update particle mapping
+      timeloopAfterParticles.singleStep(timingPool, true);
+      timingPool["Evaluation"].start();
+      if (t % evaluationFrequency == uint_c(0))
+      {
+         averageDataSliceEvaluator();
+         totalFluidMassEvaluator.computeMass(blockForest, antidunesBoundaryHandling);
+         bedloadTransportEvaluator();
+         meanVelocityComputer();
+         WALBERLA_ROOT_SECTION()
+         {
+            write2DVectorToFile(averageDataSliceEvaluator.getSolidVolumeFractionVector(),
+                                averageDataSliceEvaluator.getXLen(), averageDataSliceEvaluator.getZLen(),
+                                baseFolderName + "/svfSlice_" + std::to_string(t) + ".txt");
+            write2DVectorToFile(averageDataSliceEvaluator.getFillLevelVector(), averageDataSliceEvaluator.getXLen(),
+                                averageDataSliceEvaluator.getZLen(),
+                                baseFolderName + "/fillSlice_" + std::to_string(t) + ".txt");
+            write2DVectorToFile(averageDataSliceEvaluator.getVelocityXVector(), averageDataSliceEvaluator.getXLen(),
+                                averageDataSliceEvaluator.getZLen(),
+                                baseFolderName + "/velXSlice_" + std::to_string(t) + ".txt");
+            std::ofstream bedloadFile(bedLoadTransportFileName, std::ofstream::app);
+            bedloadFile << t << " " << bedloadTransportEvaluator.getTransportRate() << " "
+                        << bedloadTransportEvaluator.getAverageVelocity() << " " << totalHydrodynamicForceOnParticles[0]
+                        << " " << totalHydrodynamicForceOnParticles[1] << " " << totalHydrodynamicForceOnParticles[2]
+                        << "\n";
+            bedloadFile.close();
+            WALBERLA_LOG_DEVEL("____________________________________________________________________");
+            WALBERLA_LOG_DEVEL("time step = " << t);
+            const real_t froudeNumber =
+               (*meanVelocity)[0] / real_c(std::sqrt(liquidHeight * std::abs((*acceleration)[2])));
+            const real_t reynoldsNumber = (*meanVelocity)[0] * liquidHeight / viscosity;
+            const real_t weberNumber =
+               real_c(1.0) * (*meanVelocity)[0] * (*meanVelocity)[0] * liquidHeight / surfaceTension;
+            WALBERLA_LOG_DEVEL(" - Total fluid mass = " << std::setprecision(16) << (*totalFluidMass));
+            auto maxFluidZPos = averageDataSliceEvaluator.getMaxFluidZPos();
+            WALBERLA_LOG_DEVEL(" - Max fluid z-position = " << maxFluidZPos);
+            WALBERLA_LOG_DEVEL(" - Froude number = " << froudeNumber);
+            WALBERLA_LOG_DEVEL(" - Reynolds number = " << reynoldsNumber);
+            WALBERLA_LOG_DEVEL(" - We = " << weberNumber);
+            WALBERLA_LOG_DEVEL(" - meanVelocity = " << *meanVelocity);
+            std::ofstream fluidInfoFile(fluidInfoFileName, std::ofstream::app);
+            fluidInfoFile << t << " " << (*acceleration)[0] << " " << (*meanVelocity)[0] << " " << maxFluidZPos << " "
+                          << std::setprecision(16) << (*totalFluidMass) << "\n";
+            fluidInfoFile.close();
+            if (std::isnan(reynoldsNumber)) WALBERLA_ABORT("reynoldsNumber is inf!")
+         }
+         WALBERLA_LOG_DEVEL_ON_ROOT(
+            " -> CurrentExternalAcceleration in x-direction before update = " << (*acceleration)[0]);
+         (*forcingAdjuster)(meanVelocity->length());
+         (*acceleration)[0] = forcingAdjuster->getExternalForcing();
+         WALBERLA_LOG_DEVEL_ON_ROOT(
+            " -> CurrentExternalAcceleration in x-direction after update  = " << (*acceleration)[0]);
+      }
+      timingPool["Evaluation"].end();
+      if (storeSnapshot)
+      {
+         if (t % snapshotFrequency == uint_c(0) && t > uint_c(0))
+         {
+            WALBERLA_LOG_INFO_ON_ROOT("Writing checkpointing file in time step " << t)
+            blockForest->saveBlockData(snapshotBaseFolder + std::string("/tmp_") += pdfFieldFile, pdfFieldID);
+            blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" += fillFieldFile, fillFieldID);
+            blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" += excessMassFieldFile, excessMassFieldID);
+            blockForest->saveBlockData(snapshotBaseFolder + "/tmp_" += particleStorageFile, particleStorageID);
+            WALBERLA_ROOT_SECTION()
+            {
+               std::string tmpCheckpointConfigFile = snapshotBaseFolder + "/tmp_" += checkpointConfigFile;
+               std::ofstream file;
+               file.open(tmpCheckpointConfigFile.c_str());
+               file << std::setprecision(16);
+               file << t + 1 << "\n";
+               file << (*acceleration)[0] << "\n";
+               file.close();
+            }
+            forcingAdjuster->storePIDSnapshot(snapshotBaseFolder + "/" + "pidState.file");
+            WALBERLA_MPI_BARRIER();
+            // rename checkpoint files to "real" ones
+            // otherwise, the checkpointed state might be incomplete if the simulation stops due to over time during
+            // checkpointing
+            WALBERLA_ROOT_SECTION()
+            {
+               renameFile(snapshotBaseFolder + "/tmp_" += pdfFieldFile, snapshotBaseFolder + "/" += pdfFieldFile);
+               renameFile(snapshotBaseFolder + "/tmp_" += fillFieldFile, snapshotBaseFolder + "/" += fillFieldFile);
+               renameFile(snapshotBaseFolder + "/tmp_" += excessMassFieldFile,
+                          snapshotBaseFolder + "/" += excessMassFieldFile);
+               renameFile(snapshotBaseFolder + "/tmp_" += particleStorageFile,
+                          snapshotBaseFolder + "/" += particleStorageFile);
+               renameFile(snapshotBaseFolder + "/tmp_" += checkpointConfigFile,
+                          snapshotBaseFolder + "/" += checkpointConfigFile);
+            }
+         }
+      }
+      if (t % performanceLogFrequency == uint_c(0) && t > uint_c(0)) { timingPool.logResultOnRoot(); }
+   }
+   return EXIT_SUCCESS;
+}
+} // namespace antidunes
+} // namespace walberla
+int main(int argc, char** argv) { return walberla::antidunes::main(argc, argv); }
--- a/apps/showcases/Antidunes/Antidunes.prm
+++ b/apps/showcases/Antidunes/Antidunes.prm
+BlockForestParameters
+{
+   cellsPerBlock                 < 50, 20, 40 >;
+   periodicity                   < 1, 1, 0 >;
+   loadSnapshot                  false;
+   storeSnapshot                 true;
+   snapshotFrequency             10000;
+   snapshotBaseFolder            snapshots;
+}
+DomainParameters
+{
+   domainSize         <3200, 60, 160>;
+   wavePeriods        1;         // never set to 0 -> division by zero, even if you initialize a flat particle bed
+   liquidHeightFactor 2.862;    // h_0 / d (water height / avg particle diameter) -> from experiment [E1=2.862, E2=3.1724, E3=3.27586, E4=3.5862]
+   floorHeightFactor  4.1393;    // from domain bottom to sine's average
+   initialAmplitude   0;         // defined from minimum peak to maximum peak as by Pascal et al. (2021)
+}
+PIDParameters
+{
+    targetMeanVelocityMagnitude 0.02;
+    proportionalGain            2e-4;
+    derivativeGain              1e-6;
+    integralGain                2e-4;
+    maxRamp                     1e-4;
+    minActuatingVariable        0;
+    maxActuatingVariable        1e-3;
+}
+PhysicsParameters
+{
+   enableWetting     false;
+   timesteps         2000000;
+   Re                3100;       // [E1=3100, E2=3772, E3=4180, E4=4800]
+   Fr                1.31;       // [E1=1.31, E2=1.38, E3=1.44, E4=1.45]
+   We                15.6188;    // [E1=15.6188, E2=21.48, E3=25.54, E4=30.2493]
+}
+ParticleParameters
+{
+    inFileName spheres_out.dat;
+    bedCopiesInX 1;
+    bedCopiesInY 1;
+    densityRatio 2.55;
+    fixingHeightFactor 1.5; // proportional to the mean particle diameter
+    frictionCoefficient 0.5;
+    restitutionCoefficient 0.97;
+    numSubCycles 10;
+    useLubricationCorrection true;
+    useNoSlipParticles false;
+}
+ModelParameters
+{
+   pdfReconstructionModel        OnlyMissing;
+   pdfRefillingModel             EquilibriumRefilling;
+   excessMassDistributionModel   EvenlyNewInterfaceFallbackLiquid;
+   curvatureModel                FiniteDifferenceMethod;
+   useSimpleMassExchange         false;
+   cellConversionThreshold       1e-2;
+   cellConversionForceThreshold  1e-1;
+}
+EvaluationParameters
+{
+   performanceLogFrequency 10000;
+   evaluationFrequency     1000;
+   baseFolderName          eval;
+}
+StabilityChecker
+{
+   checkFrequency 0;
+   streamOutput   false;
+   vtkOutput      true;
+}
+BoundaryParameters
+{
+   // X
+   //Border { direction W;  walldistance -1; NoSlip{} }
+   //Border { direction E;  walldistance -1; NoSlip{} }
+   // Y
+   //Border { direction N;  walldistance -1; NoSlip{} }
+   //Border { direction S;  walldistance -1; NoSlip{} }
+   // Z
+   Border { direction T;  walldistance -1; NoSlip{} }
+   Border { direction B;  walldistance -1; NoSlip{} }
+}
+MeshOutputParameters
+{
+   writeFrequency 0;
+   baseFolder     mesh-out;
+}
+VTK
+{
+   fluid_field
+   {
+      writeFrequency       1000;
+      ghostLayers          0;
+      baseFolder           vtk-out;
+      samplingResolution   1;
+      writers
+      {
+         fill_level;
+         mapped_flag;
+         velocity;
+         density;
+         //curvature;
+         //normal;
+         //obstacle_normal;
+         //pdf;
+         //flag;
+         //force;
+      }
+      CellBB_filter {
+         min < 0, 29, 0 >;
+         max < 3200, 30, 160 >;
+      }
+      inclusion_filters
+      {
+         CellBB_filter;
+         //liquidInterfaceFilter; // only include liquid and interface cells in VTK output
+      }
+      before_functions
+      {
+         //ghost_layer_synchronization; // only needed if writing the ghost layer
+         gas_cell_zero_setter;          // sets velocity=0 and density=1 all gas cells before writing VTK output
+      }
+   }
+   domain_decomposition
+   {
+      writeFrequency             10000000000;
+      baseFolder                 vtk-out;
+      outputDomainDecomposition  true;
+   }
+}
--- a/apps/showcases/Antidunes/AntidunesBoundaryHandling.h
+++ b/apps/showcases/Antidunes/AntidunesBoundaryHandling.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Boundary.h
+//! \ingroup free_surface
+//! \author Martin Bauer
+//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de>
+//! \brief Boundary handling for the free surface LBM module.
+//
+//======================================================================================================================
+#pragma once
+#include "blockforest/StructuredBlockForest.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "boundary/BoundaryHandling.h"
+#include "field/GhostLayerField.h"
+#include "geometry/initializer/InitializationManager.h"
+#include "geometry/initializer/OverlapFieldFromBody.h"
+#include "lbm/boundary/all.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/free_surface/FlagInfo.h"
+#include "lbm/free_surface/InitFunctions.h"
+#include "lbm/free_surface/InterfaceFromFillLevel.h"
+#include "lbm/free_surface/boundary/SimplePressureWithFreeSurface.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/CurvedLinear.h"
+#include "lbm_mesapd_coupling/momentum_exchange_method/boundary/SimpleBB.h"
+namespace walberla
+{
+namespace antidunes
+{
+namespace free_surface
+{
+/***********************************************************************************************************************
+ * Boundary handling for the free surface LBM extension.
+ **********************************************************************************************************************/
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+class AntidunesBoundaryHandling
+{
+ public:
+   using flag_t    = typename FlagField_T::value_type;
+   using Stencil_T = typename LatticeModel_T::Stencil;
+   using CommunicationStencil_T =
+      std::conditional_t< LatticeModel_T::Stencil::D == uint_t(2), stencil::D2Q9, stencil::D3Q27 >;
+   using PdfField_T = lbm::PdfField< LatticeModel_T >;
+   // boundary
+   using NoSlip_T   = lbm::NoSlip< LatticeModel_T, flag_t >;
+   using FreeSlip_T = lbm::FreeSlip< LatticeModel_T, FlagField_T >;
+   using UBB_T      = lbm::UBB< LatticeModel_T, flag_t >;
+   using Pressure_T = walberla::free_surface::SimplePressureWithFreeSurface< LatticeModel_T, FlagField_T >;
+   using Outlet_T   = lbm::Outlet< LatticeModel_T, FlagField_T, 4, 3 >;
+   using UBB_Inflow_T =
+      lbm::UBB< LatticeModel_T, flag_t >; // creates interface cells in the direction of the prescribed velocity, i.e.,
+                                          // represents an inflow boundary condition
+   using MovingObstacle_T = lbm_mesapd_coupling::SimpleBB< LatticeModel_T, FlagField_T, ParticleAccessor_T >;
+   // handling type
+   using BoundaryHandling_T =
+      BoundaryHandling< FlagField_T, Stencil_T, NoSlip_T, UBB_T, UBB_Inflow_T, Pressure_T, Pressure_T, Outlet_T,
+                        FreeSlip_T,
+                        MovingObstacle_T >; // 2 pressure boundaries with different densities, e.g., inflow-outflow
+   using FlagInfo_T = walberla::free_surface::FlagInfo< FlagField_T >;
+   // constructor
+   AntidunesBoundaryHandling(const std::shared_ptr< StructuredBlockForest >& blockForest, BlockDataID pdfFieldID,
+                             BlockDataID fillLevelID, BlockDataID particleFieldID,
+                             const shared_ptr< ParticleAccessor_T >& ac,
+                             std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct);
+   // initialize fluid field from config file using (quotes indicate the string to be used in the file):
+   // - "CellInterval" (see src/geometry/initializer/BoundaryFromCellInterval.h)
+   // - "Border" (see src/geometry/initializer/BoundaryFromDomainBorder.h)
+   // - "Image" (see src/geometry/initializer/BoundaryFromImage.h)
+   // - "Body" (see src/geometry/initializer/OverlapFieldFromBody.h)
+   inline void initFromConfig(const Config::BlockHandle& block);
+   // initialize free surface object from geometric body (see src/geometry/initializer/OverlapFieldFromBody.h)
+   template< typename Body_T >
+   inline void addFreeSurfaceObject(const Body_T& body, bool addOrSubtract = false);
+   // clear and initialize flags in every cell according to the fill level and initialize fill level in boundaries (with
+   // value 1) and obstacles such that the bubble model does not detect obstacles as gas cells
+   void initFlagsFromFillLevel();
+   inline void setNoSlipAtBorder(stencil::Direction d, cell_idx_t wallDistance = cell_idx_c(0));
+   inline void setNoSlipAtAllBorders(cell_idx_t wallDistance = cell_idx_c(0));
+   void setNoSlipInCell(const Cell& globalCell);
+   inline void setFreeSlipAtBorder(stencil::Direction d, cell_idx_t wallDistance = cell_idx_c(0));
+   inline void setFreeSlipAtAllBorders(cell_idx_t wallDistance = cell_idx_c(0));
+   void setFreeSlipInCell(const Cell& globalCell);
+   void setUBBInCell(const Cell& globalCell, const Vector3< real_t >& velocity);
+   // UBB that generates interface cells to resemble an inflow boundary
+   void setInflowInCell(const Cell& globalCell, const Vector3< real_t >& velocity);
+   inline void setPressure(real_t density);
+   void setPressureOutflow(real_t density);
+   void setBodyForce(const Vector3< real_t >& bodyForce);
+   void enableBubbleOutflow(BubbleModelBase* bubbleModel);
+   // checks if an obstacle cell is located in an outermost ghost layer (corners are explicitly ignored, as they do not
+   // influence periodic communication)
+   Vector3< bool > isObstacleInGlobalGhostLayer();
+   // flag management
+   const walberla::free_surface::FlagInfo< FlagField_T >& getFlagInfo() const { return flagInfo_; }
+   // flag IDs
+   static const field::FlagUID noSlipFlagID;
+   static const field::FlagUID ubbFlagID;
+   static const field::FlagUID ubbInflowFlagID;
+   static const field::FlagUID pressureFlagID;
+   static const field::FlagUID pressureOutflowFlagID;
+   static const field::FlagUID outletFlagID;
+   static const field::FlagUID freeSlipFlagID;
+   static const field::FlagUID movingObstacleFlagID;
+   // boundary IDs
+   static const BoundaryUID noSlipBoundaryID;
+   static const BoundaryUID ubbBoundaryID;
+   static const BoundaryUID ubbInflowBoundaryID;
+   static const BoundaryUID pressureBoundaryID;
+   static const BoundaryUID pressureOutflowBoundaryID;
+   static const BoundaryUID outletBoundaryID;
+   static const BoundaryUID freeSlipBoundaryID;
+   static const BoundaryUID movingObstacleBoundaryID;
+   inline BlockDataID getHandlingID() const { return handlingID_; }
+   inline BlockDataID getPdfFieldID() const { return pdfFieldID_; }
+   inline BlockDataID getFillFieldID() const { return fillFieldID_; }
+   inline BlockDataID getFlagFieldID() const { return flagFieldID_; }
+   inline BlockDataID getParticleFieldID() const { return particleFieldID_; }
+   inline std::shared_ptr< StructuredBlockForest > getBlockForest() const { return blockForest_; };
+   inline shared_ptr< ParticleAccessor_T > getParticleAccessor() const { return ac_; }
+   inline std::function< real_t(const Vector3< real_t >&) > getHydrostaticDensityFct() const
+   {
+      return hydrostaticDensityFct_;
+   }
+   // executes standard waLBerla boundary handling
+   class ExecuteBoundaryHandling
+   {
+    public:
+      ExecuteBoundaryHandling(const BlockDataID& collection) : handlingID_(collection) {}
+      void operator()(IBlock* const block) const
+      {
+         BoundaryHandling_T* const handling = block->getData< BoundaryHandling_T >(handlingID_);
+         // reset "near boundary" flags
+         handling->refresh();
+         (*handling)();
+      }
+    protected:
+      BlockDataID handlingID_;
+   }; // class ExecuteBoundaryHandling
+   ExecuteBoundaryHandling getBoundarySweep() const { return ExecuteBoundaryHandling(getHandlingID()); }
+ private:
+   walberla::free_surface::FlagInfo< FlagField_T > flagInfo_;
+   // register standard waLBerla initializers
+   geometry::initializer::InitializationManager getInitManager();
+   std::shared_ptr< StructuredBlockForest > blockForest_;
+   BlockDataID flagFieldID_;
+   BlockDataID pdfFieldID_;
+   BlockDataID fillFieldID_;
+   BlockDataID particleFieldID_;
+   shared_ptr< ParticleAccessor_T > ac_;
+   std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct_;
+   BlockDataID handlingID_;
+   blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > comm_;
+}; // class AntidunesBoundaryHandling
+} // namespace free_surface
+} // namespace antidunes
+} // namespace walberla
+#include "AntidunesBoundaryHandling.impl.h"
--- a/apps/showcases/Antidunes/AntidunesBoundaryHandling.impl.h
+++ b/apps/showcases/Antidunes/AntidunesBoundaryHandling.impl.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AntidunesBoundaryHandling.impl.h
+//! \ingroup free_surface
+//! \author Martin Bauer
+//! \author Christoph Schwarzmeier <christoph.schwarzmeier@fau.de>
+//! \brief Boundary handling for the free surface LBM module.
+//
+//======================================================================================================================
+#pragma once
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "geometry/initializer/BoundaryFromCellInterval.h"
+#include "geometry/initializer/BoundaryFromDomainBorder.h"
+#include "geometry/initializer/BoundaryFromImage.h"
+#include "geometry/structured/GrayScaleImage.h"
+#include "lbm/free_surface/FlagInfo.h"
+#include "lbm/free_surface/InterfaceFromFillLevel.h"
+#include "lbm/lattice_model/CollisionModel.h"
+#include "AntidunesBoundaryHandling.h"
+namespace walberla
+{
+namespace antidunes
+{
+namespace free_surface
+{
+namespace internal
+{
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+class BoundaryBlockDataHandling
+   : public domain_decomposition::BlockDataHandling< typename AntidunesBoundaryHandling<
+        LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::BoundaryHandling_T >
+{
+ public:
+   using BoundaryHandling_T =
+      typename AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                          ParticleAccessor_T >::BoundaryHandling_T; // handling as defined in
+                                                                                    // AntidunesBoundaryHandling.h
+   BoundaryBlockDataHandling(
+      const AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >* boundary)
+      : boundary_(boundary)
+   {}
+   // initialize standard waLBerla boundary handling
+   BoundaryHandling_T* initialize(IBlock* const block)
+   {
+      using B      = AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >;
+      using flag_t = typename B::flag_t;
+      // get fields
+      FlagField_T* const flagField           = block->getData< FlagField_T >(boundary_->getFlagFieldID());
+      typename B::PdfField_T* const pdfField = block->getData< typename B::PdfField_T >(boundary_->getPdfFieldID());
+      lbm_mesapd_coupling::ParticleField_T* const particleField =
+         block->getData< lbm_mesapd_coupling::ParticleField_T >(boundary_->getParticleFieldID());
+      auto interfaceFlag = flag_t(flagField->getFlag(walberla::free_surface::flagIDs::interfaceFlagID));
+      auto liquidFlag    = flag_t(flagField->getFlag(walberla::free_surface::flagIDs::liquidFlagID));
+      // domainMask is used to identify liquid and interface cells
+      auto domainMask = flag_t(liquidFlag | interfaceFlag);
+      WALBERLA_ASSERT(domainMask != 0);
+      // initialize boundary conditions
+      typename B::UBB_T ubb(B::ubbBoundaryID, B::ubbFlagID, pdfField, flagField);
+      typename B::UBB_Inflow_T ubbInflow(B::ubbInflowBoundaryID, B::ubbInflowFlagID, pdfField, flagField);
+      typename B::NoSlip_T noslip(B::noSlipBoundaryID, B::noSlipFlagID, pdfField);
+      typename B::Pressure_T pressure(B::pressureBoundaryID, B::pressureFlagID, block, pdfField, flagField,
+                                      interfaceFlag, real_c(1.0));
+      typename B::Pressure_T pressureOutflow(B::pressureOutflowBoundaryID, B::pressureOutflowFlagID, block, pdfField,
+                                             flagField, interfaceFlag, real_c(1.0));
+      typename B::Outlet_T outlet(B::outletBoundaryID, B::outletFlagID, pdfField, flagField, domainMask);
+      typename B::FreeSlip_T freeSlip(B::freeSlipBoundaryID, B::freeSlipFlagID, pdfField, flagField, domainMask);
+      typename B::MovingObstacle_T curvedLinear(B::movingObstacleBoundaryID, B::movingObstacleFlagID, pdfField,
+                                                flagField, particleField, boundary_->getParticleAccessor(), domainMask,
+                                                *boundary_->getBlockForest(), *block,
+                                                boundary_->getHydrostaticDensityFct());
+      return new BoundaryHandling_T("Boundary Handling", flagField, domainMask, noslip, ubb, ubbInflow, pressure,
+                                    pressureOutflow, outlet, freeSlip, curvedLinear);
+   }
+   void serialize(IBlock* const block, const BlockDataID& id, mpi::SendBuffer& buffer)
+   {
+      BoundaryHandling_T* const boundaryHandlingPtr = block->getData< BoundaryHandling_T >(id);
+      CellInterval everyCell                        = boundaryHandlingPtr->getFlagField()->xyzSizeWithGhostLayer();
+      boundaryHandlingPtr->pack(buffer, everyCell, true);
+   }
+   BoundaryHandling_T* deserialize(IBlock* const block) { return initialize(block); }
+   void deserialize(IBlock* const block, const BlockDataID& id, mpi::RecvBuffer& buffer)
+   {
+      BoundaryHandling_T* const boundaryHandlingPtr = block->getData< BoundaryHandling_T >(id);
+      CellInterval everyCell                        = boundaryHandlingPtr->getFlagField()->xyzSizeWithGhostLayer();
+      boundaryHandlingPtr->unpack(buffer, everyCell, true);
+   }
+ private:
+   const AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >* boundary_;
+}; // class BoundaryBlockDataHandling
+// helper function wrapper for adding the flag field to the block storage; since the input parameter for an
+// initialization function in field::addFlagFieldToStorage() is a std::function<void(FlagField_T*,IBlock* const)>, we
+// need a function wrapper that has both these input parameters; as FlagInfo< FlagField_T >::registerFlags() does not
+// have both of these input parameters, a function wrapper with both input parameters is created and the second input
+// parameter is simply ignored inside the function wrapper
+template< typename FlagField_T >
+void flagFieldInitFunction(FlagField_T* flagField, IBlock* const, const Set< field::FlagUID >& obstacleIDs,
+                           const Set< field::FlagUID >& outflowIDs, const Set< field::FlagUID >& inflowIDs,
+                           const Set< field::FlagUID >& freeSlipIDs)
+{
+   // register flags in the flag field
+   walberla::free_surface::FlagInfo< FlagField_T >::registerFlags(flagField, obstacleIDs, outflowIDs, inflowIDs,
+                                                                  freeSlipIDs);
+}
+} // namespace internal
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::AntidunesBoundaryHandling(
+   const std::shared_ptr< StructuredBlockForest >& blockForest, BlockDataID pdfFieldID, BlockDataID fillLevelID,
+   BlockDataID particleFieldID, const shared_ptr< ParticleAccessor_T >& ac,
+   std::function< real_t(const Vector3< real_t >&) > hydrostaticDensityFct)
+   : blockForest_(blockForest), pdfFieldID_(pdfFieldID), fillFieldID_(fillLevelID), particleFieldID_(particleFieldID),
+     ac_(ac), hydrostaticDensityFct_(std::move(hydrostaticDensityFct)), comm_(blockForest)
+{
+   // initialize obstacleIDs
+   Set< FlagUID > obstacleIDs;
+   obstacleIDs += noSlipFlagID;
+   obstacleIDs += ubbFlagID;
+   obstacleIDs += ubbInflowFlagID;
+   obstacleIDs += pressureFlagID;
+   obstacleIDs += pressureOutflowFlagID;
+   obstacleIDs += freeSlipFlagID;
+   obstacleIDs += outletFlagID;
+   obstacleIDs += movingObstacleFlagID;
+   // initialize outflowIDs
+   Set< FlagUID > outflowIDs;
+   outflowIDs += pressureOutflowFlagID;
+   outflowIDs += outletFlagID;
+   // initialize outflowIDs
+   Set< FlagUID > inflowIDs;
+   inflowIDs += ubbInflowFlagID;
+   // initialize freeSlipIDs
+   Set< FlagUID > freeSlipIDs;
+   freeSlipIDs += freeSlipFlagID;
+   // create callable function wrapper with input arguments 1 and 2 unset, whereas arguments 3, 4 and 5 are set to be
+   // obstacleIDs, outflowIDs, and inflowIDs, respectively; this is necessary for field::addFlagFieldToStorage()
+   auto ffInitFunc = std::bind(internal::flagFieldInitFunction< FlagField_T >, std::placeholders::_1,
+                               std::placeholders::_2, obstacleIDs, outflowIDs, inflowIDs, freeSlipIDs);
+   // IMPORTANT REMARK: The flag field needs two ghost layers because of function advectMass(). There, the flags of all
+   // D3Q* neighbors are determined for each cell, including cells in the first ghost layer. Therefore, all D3Q*
+   // neighbors of the first ghost layer must be accessible. This requires a second ghost layer.
+   flagFieldID_ = field::addFlagFieldToStorage< FlagField_T >(blockForest, "Flags", uint_c(2), true, ffInitFunc);
+   // create FlagInfo
+   flagInfo_ = walberla::free_surface::FlagInfo< FlagField_T >(obstacleIDs, outflowIDs, inflowIDs, freeSlipIDs);
+   WALBERLA_ASSERT(flagInfo_.isConsistentAcrossBlocksAndProcesses(blockForest, flagFieldID_));
+   // add boundary handling to blockForest
+   handlingID_ = blockForest_->addBlockData(
+      std::make_shared<
+         internal::BoundaryBlockDataHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T > >(this),
+      "Boundary Handling");
+   // create communication object with fill level field, since fill levels determine the flags during the simulation
+   comm_.addPackInfo(std::make_shared< field::communication::PackInfo< ScalarField_T > >(fillFieldID_));
+}
+// define IDs (static const variables)
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::noSlipFlagID =
+      field::FlagUID("NoSlip");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbFlagID =
+      field::FlagUID("UBB");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbInflowFlagID =
+      field::FlagUID("UBB_Inflow");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureFlagID =
+      field::FlagUID("Pressure");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureOutflowFlagID =
+      field::FlagUID("PressureOutflow");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::outletFlagID =
+      field::FlagUID("Outlet");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::freeSlipFlagID =
+      field::FlagUID("FreeSlip");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const field::FlagUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::movingObstacleFlagID =
+      field::FlagUID("MovingObstacle");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::noSlipBoundaryID =
+      BoundaryUID("NoSlip");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbBoundaryID =
+      BoundaryUID("UBB");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::ubbInflowBoundaryID =
+      BoundaryUID("UBB_Inflow");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::pressureBoundaryID =
+      BoundaryUID("Pressure");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                             ParticleAccessor_T >::pressureOutflowBoundaryID =
+   BoundaryUID("PressureOutflow");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::outletBoundaryID =
+      BoundaryUID("Outlet");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::freeSlipBoundaryID =
+      BoundaryUID("FreeSlip");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+const BoundaryUID AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                             ParticleAccessor_T >::movingObstacleBoundaryID =
+   BoundaryUID("MovingObstacle");
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+geometry::initializer::InitializationManager
+   AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::getInitManager()
+{
+   using namespace geometry::initializer;
+   InitializationManager initManager(blockForest_->getBlockStorage());
+   // define initializers
+   auto cellIntvInit = std::make_shared< BoundaryFromCellInterval< BoundaryHandling_T > >(*blockForest_, handlingID_);
+   auto borderInit   = std::make_shared< BoundaryFromDomainBorder< BoundaryHandling_T > >(*blockForest_, handlingID_);
+   auto imgInit =
+      std::make_shared< BoundaryFromImage< BoundaryHandling_T, geometry::GrayScaleImage > >(*blockForest_, handlingID_);
+   auto bodyInit = std::make_shared< OverlapFieldFromBody >(*blockForest_, fillFieldID_);
+   // register initializers
+   initManager.registerInitializer("CellInterval", cellIntvInit);
+   initManager.registerInitializer("Border", borderInit);
+   initManager.registerInitializer("Image", imgInit);
+   initManager.registerInitializer("Body", bodyInit);
+   return initManager;
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::initFromConfig(
+   const Config::BlockHandle& configBlock)
+{
+   // initialize from config file
+   getInitManager().init(configBlock);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+template< typename Body_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::addFreeSurfaceObject(
+   const Body_T& body, bool addOrSubtract)
+{
+   geometry::initializer::OverlapFieldFromBody(*blockForest_, fillFieldID_).init(body, addOrSubtract);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipAtBorder(
+   stencil::Direction d, cell_idx_t wallDistance)
+{
+   geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_);
+   init.init(noSlipFlagID, d, wallDistance);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipAtAllBorders(
+   cell_idx_t wallDistance)
+{
+   geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_);
+   init.initAllBorders(noSlipFlagID, wallDistance);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setNoSlipInCell(
+   const Cell& globalCell)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      // transform cell in global coordinates to cell in block local coordinates
+      Cell blockLocalCell;
+      blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell);
+      handling->forceBoundary(noSlipFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2]);
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setFreeSlipAtBorder(
+   stencil::Direction d, cell_idx_t wallDistance)
+{
+   geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_);
+   init.init(freeSlipFlagID, d, wallDistance);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                ParticleAccessor_T >::setFreeSlipAtAllBorders(cell_idx_t wallDistance)
+{
+   geometry::initializer::BoundaryFromDomainBorder< BoundaryHandling_T > init(*blockForest_, handlingID_);
+   init.initAllBorders(freeSlipFlagID, wallDistance);
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setFreeSlipInCell(
+   const Cell& globalCell)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      // transform cell in global coordinates to cell in block local coordinates
+      Cell blockLocalCell;
+      blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell);
+      handling->forceBoundary(freeSlipFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2]);
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setPressure(
+   real_t density)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      Pressure_T& pressure =
+         handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureFlagID));
+      pressure.setLatticeDensity(density);
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setUBBInCell(
+   const Cell& globalCell, const Vector3< real_t >& velocity)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      typename UBB_Inflow_T::Velocity ubbVel(velocity);
+      // transform cell in global coordinates to cell in block-local coordinates
+      Cell blockLocalCell;
+      blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell);
+      // get block cell bounding box to check if cell is contained in block
+      CellInterval blockCellBB = blockForest_->getBlockCellBB(*blockIt);
+      // flag field has two ghost layers so blockCellBB is actually larger than returned; this is relevant for setups
+      // where the UBB is set in a ghost layer cell
+      blockCellBB.expand(cell_idx_c(2));
+      if (blockCellBB.contains(globalCell))
+      {
+         handling->forceBoundary(ubbFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2], ubbVel);
+      }
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setInflowInCell(
+   const Cell& globalCell, const Vector3< real_t >& velocity)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      typename UBB_Inflow_T::Velocity ubbVel(velocity);
+      // transform cell in global coordinates to cell in block-local coordinates
+      Cell blockLocalCell;
+      blockForest_->transformGlobalToBlockLocalCell(blockLocalCell, *blockIt, globalCell);
+      // get block cell bounding box to check if cell is contained in block
+      CellInterval blockCellBB = blockForest_->getBlockCellBB(*blockIt);
+      // flag field has two ghost layers so blockCellBB is actually larger than returned; this is relevant for setups
+      // where the UBB is set in a ghost layer cell
+      blockCellBB.expand(cell_idx_c(2));
+      if (blockCellBB.contains(globalCell))
+      {
+         handling->forceBoundary(ubbInflowFlagID, blockLocalCell[0], blockLocalCell[1], blockLocalCell[2], ubbVel);
+      }
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::setPressureOutflow(
+   real_t density)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      Pressure_T& pressure =
+         handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureOutflowFlagID));
+      pressure.setLatticeDensity(density);
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T, ParticleAccessor_T >::enableBubbleOutflow(
+   BubbleModelBase* bubbleModel)
+{
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      BoundaryHandling_T* const handling = blockIt->template getData< BoundaryHandling_T >(handlingID_);
+      // get pressure from boundary handling
+      Pressure_T& pressure =
+         handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureFlagID));
+      Pressure_T& pressureOutflow =
+         handling->template getBoundaryCondition< Pressure_T >(handling->getBoundaryUID(pressureOutflowFlagID));
+      // set pressure in bubble model
+      pressure.setBubbleModel(bubbleModel);
+      pressureOutflow.setBubbleModel(bubbleModel);
+   }
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+Vector3< bool > AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                           ParticleAccessor_T >::isObstacleInGlobalGhostLayer()
+{
+   Vector3< bool > isObstacleInGlobalGhostLayer(false, false, false);
+   for (auto blockIt = blockForest_->begin(); blockIt != blockForest_->end(); ++blockIt)
+   {
+      const FlagField_T* const flagField = blockIt->template getData< const FlagField_T >(flagFieldID_);
+      const CellInterval domainCellBB = blockForest_->getDomainCellBB();
+      // disable OpenMP such that loop termination works correctly
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ_OMP(flagField, uint_c(1), omp critical, {
+         // get cell in global coordinates
+         Cell globalCell = Cell(x, y, z);
+         blockForest_->transformBlockLocalToGlobalCell(globalCell, *blockIt);
+         // check if the current cell is located in a global ghost layer
+         const bool isCellInGlobalGhostLayerX =
+            globalCell[0] < domainCellBB.xMin() || globalCell[0] > domainCellBB.xMax();
+         const bool isCellInGlobalGhostLayerY =
+            globalCell[1] < domainCellBB.yMin() || globalCell[1] > domainCellBB.yMax();
+         const bool isCellInGlobalGhostLayerZ =
+            globalCell[2] < domainCellBB.zMin() || globalCell[2] > domainCellBB.zMax();
+         // skip corners, as they do not influence periodic communication
+         if ((isCellInGlobalGhostLayerX && (isCellInGlobalGhostLayerY || isCellInGlobalGhostLayerZ)) ||
+             (isCellInGlobalGhostLayerY && isCellInGlobalGhostLayerZ))
+         {
+            continue;
+         }
+         if (!isObstacleInGlobalGhostLayer[0] && isCellInGlobalGhostLayerX &&
+             isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet())))
+         {
+            isObstacleInGlobalGhostLayer[0] = true;
+         }
+         if (!isObstacleInGlobalGhostLayer[1] && isCellInGlobalGhostLayerY &&
+             isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet())))
+         {
+            isObstacleInGlobalGhostLayer[1] = true;
+         }
+         if (!isObstacleInGlobalGhostLayer[2] && isCellInGlobalGhostLayerZ &&
+             isPartOfMaskSet(flagField->get(x, y, z), flagField->getMask(flagInfo_.getObstacleIDSet())))
+         {
+            isObstacleInGlobalGhostLayer[2] = true;
+         }
+         if (isObstacleInGlobalGhostLayer[0] && isObstacleInGlobalGhostLayer[1] && isObstacleInGlobalGhostLayer[2])
+         {
+            break; // there is no need to check other cells on this block
+         }
+      }) // WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ_OMP
+   }
+   mpi::allReduceInplace(isObstacleInGlobalGhostLayer, mpi::LOGICAL_OR);
+   return isObstacleInGlobalGhostLayer;
+}
+template< typename LatticeModel_T, typename FlagField_T, typename ScalarField_T, typename ParticleAccessor_T >
+void AntidunesBoundaryHandling< LatticeModel_T, FlagField_T, ScalarField_T,
+                                ParticleAccessor_T >::initFlagsFromFillLevel()
+{
+   const Vector3< bool > isObstacleInGlobalGhostLayer = this->isObstacleInGlobalGhostLayer();
+   WALBERLA_ROOT_SECTION()
+   {
+      if ((blockForest_->isXPeriodic() && isObstacleInGlobalGhostLayer[0]) ||
+          (blockForest_->isYPeriodic() && isObstacleInGlobalGhostLayer[1]) ||
+          (blockForest_->isZPeriodic() && isObstacleInGlobalGhostLayer[2]))
+      {
+         WALBERLA_LOG_WARNING_ON_ROOT(
+            "WARNING: An obstacle cell is located in a global outermost ghost layer in a periodic "
+            "direction. Be aware that due to periodicity, this obstacle cell will be "
+            "overwritten during communication.");
+      }
+   }
+   // communicate fill level (neighborhood is used in initialization)
+   comm_();
+   // initialize fill level in boundaries (with value 1), i.e., obstacles such that the bubble model does not detect
+   // obstacles as gas cells
+   walberla::free_surface::initFillLevelsInBoundaries< BoundaryHandling_T, typename LatticeModel_T::Stencil,
+                                                       ScalarField_T >(blockForest_, handlingID_, fillFieldID_);
+   // clear and initialize flags in every cell according to the fill level
+   walberla::free_surface::initFlagsFromFillLevels< BoundaryHandling_T, typename LatticeModel_T::Stencil, FlagField_T,
+                                                    const ScalarField_T >(blockForest_, flagInfo_, handlingID_,
+                                                                          fillFieldID_);
+}
+} // namespace free_surface
+} // namespace antidunes
+} // namespace walberla
--- a/apps/showcases/Antidunes/AntidunesLatticeModelGeneration.py
+++ b/apps/showcases/Antidunes/AntidunesLatticeModelGeneration.py
+import sympy as sp
+import pystencils as ps
+from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, create_lb_collision_rule
+from lbmpy.enums import ForceModel, Method, Stencil
+from lbmpy.stencils import LBStencil
+from pystencils_walberla import CodeGeneration
+from lbmpy_walberla import generate_lattice_model
+with CodeGeneration() as ctx:
+    # general parameters
+    layout = 'fzyx'
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = LBStencil(Stencil.D3Q27)
+    omega = sp.Symbol('omega')
+    force_field = ps.fields(f"force(3): {data_type}[3D]", layout=layout)
+    # method definition
+    lbm_config = LBMConfig(stencil=stencil,
+                           method=Method.CUMULANT,
+                           relaxation_rate=omega,
+                           compressible=True,
+                           force=force_field,
+                           zero_centered=False,
+                           streaming_pattern='pull',    # free surface implementation only works with pull pattern
+                           galilean_correction=True)
+    # optimizations to be used by the code generator
+    lbm_opt = LBMOptimisation(cse_global=True,
+                              field_layout=layout)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config,
+                                              lbm_optimisation=lbm_opt)
+    generate_lattice_model(ctx, "AntidunesLatticeModel", collision_rule, field_layout=layout)
No results found