//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file NonUniformGridGPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/ErrorChecking.h"
#include "gpu/FieldCopy.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/ParallelStreams.h"
#include "gpu/communication/NonUniformGPUScheme.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include <cmath>
#include "GridGeneration.h"
#include "LdcSetup.h"
#include "NonUniformGridGPUInfoHeader.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/gpu/AddToStorage.h"
#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h"
#include "lbm_generated/gpu/GPUPdfField.h"
#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h"
using namespace walberla;
using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
using FlagField_T = FlagField< uint8_t >;
using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField_T >;
using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection;
using gpu::communication::NonUniformGPUScheme;
int main(int argc, char** argv)
{
const mpi::Environment env(argc, argv);
mpi::MPIManager::instance()->useWorldComm();
gpu::selectDeviceBasedOnMpiRank();
const std::string input_filename(argv[1]);
const bool inputIsPython = string_ends_with(input_filename, ".py");
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg;
logging::configureLogging(config);
auto domainSetup = config->getOneBlock("DomainSetup");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.95));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool gpuEnabledMPI = parameters.getParameter< bool >("gpuEnabledMPI", false);
shared_ptr< BlockForest > bfs;
createBlockForest(bfs, domainSetup, blockForestSetup);
if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
{
WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
return EXIT_SUCCESS;
}
auto blocks =
std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
blocks->createCellBoundingBoxes();
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks() << " on " << blocks->getNumberOfLevels() << " refinement levels")
for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
{
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
}
WALBERLA_LOG_INFO_ON_ROOT("Start field allocation")
// Creating fields
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto allocator = make_shared< gpu::HostFieldAllocator< real_t > >();
const BlockDataID pdfFieldCpuID =
lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx, allocator);
const BlockDataID velFieldCpuID =
field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2), allocator);
const BlockDataID densityFieldCpuID =
field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2), allocator);
const BlockDataID flagFieldID =
field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
const BlockDataID pdfFieldGpuID =
lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true);
const BlockDataID velFieldGpuID =
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
const BlockDataID densityFieldGpuID =
gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "density on GPU", true);
WALBERLA_LOG_INFO_ON_ROOT("Finished field allocation")
const Cell innerOuterSplit =
Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
Vector3< int32_t > gpuBlockSize =
parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0],
gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit);
for (auto& iBlock : *blocks){
sweepCollection.initialise(&iBlock, cell_idx_c(1));
}
sweepCollection.initialiseBlockPointer();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// LB SWEEPS AND BOUNDARY HANDLING ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto ldc = std::make_shared< LDC >(blocks->getDepth());
const FlagUID fluidFlagUID("Fluid");
ldc->setupBoundaryFlagField(*blocks, flagFieldID);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 0);
BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
auto communication = std::make_shared< NonUniformGPUScheme< CommunicationStencil_T > >(blocks, gpuEnabledMPI);
auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, pdfFieldGpuID);
communication->addPackInfo(packInfo);
WALBERLA_MPI_BARRIER()
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
sweepCollection.setOuterPriority(streamHighPriority);
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo);
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0));
// VTK
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);
auto finalDomain = blocks->getDomain();
if (vtkWriteFrequency > 0){
auto vtkOutput =
vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step",
false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
if (parameters.getParameter< bool >("writeOnlySlice", true)){
const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(blocks->getDepth()),
finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(blocks->getDepth()));
vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
}
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
sweepCollection.calculateMacroscopicParameters(&block);
gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
});
timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
if (remainingTimeLoggerFrequency > 0){
auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency);
timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
}
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Non uniform Grid benchmark with " << fluidCells.numberOfCells()
<< " fluid cells (in total on all levels)")
WcTimingPool timeloopTiming;
WcTimer simTimer;
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WALBERLA_MPI_BARRIER()
simTimer.start();
timeLoop.run(timeloopTiming);
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_ROOT_SECTION()
{
if (inputIsPython)
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("numProcesses", lbm_generated::PerformanceEvaluation< FlagField_T >::processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return EXIT_SUCCESS;
}
import sympy as sp
import numpy as np
import pystencils as ps
from pystencils.typing import TypedSymbol
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, SubgridScaleModel
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
omega = sp.symbols("omega")
omega_free = sp.Symbol("omega_free")
compile_time_block_size = False
max_threads = 256
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
TypedSymbol("cudaBlockSize2", np.int32))
gpu_indexing_params = {'block_size': sweep_block_size}
info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT
fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
method = create_lb_method(lbm_config=lbm_config)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([0.05, 0, 0], data_type=field_type))
generate_lbm_package(ctx, name="NonUniformGridGPU",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
max_threads=max_threads)
infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
field_typedefs = {'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
generate_info_header(ctx, 'NonUniformGridGPUInfoHeader',
field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sqlite3
import os
import sys
try:
import machinestate as ms
except ImportError:
ms = None
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
class Scenario:
def __init__(self,
domain_size=(64, 64, 64),
root_blocks=(2, 2, 2),
num_processes=1,
refinement_depth=0,
cells_per_block=(32, 32, 32),
timesteps=101,
gpu_enabled_mpi=False,
vtk_write_frequency=0,
logger_frequency=30,
blockforest_filestem="blockforest",
write_setup_vtk=True,
db_file_name=None):
self.domain_size = domain_size
self.root_blocks = root_blocks
self.cells_per_block = cells_per_block
self.periodic = (0, 0, 1)
self.refinement_depth = refinement_depth
self.num_processes = num_processes
self.bfs_filestem = blockforest_filestem
self.write_setup_vtk = write_setup_vtk
self.timesteps = timesteps
self.gpu_enabled_mpi = gpu_enabled_mpi
self.vtk_write_frequency = vtk_write_frequency
self.logger_frequency = logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'DomainSetup': {
'domainSize': self.domain_size,
'rootBlocks': self.root_blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'SetupBlockForest': {
'refinementDepth': self.refinement_depth,
'numProcesses': self.num_processes,
'blockForestFilestem': self.bfs_filestem,
'writeVtk': self.write_setup_vtk,
'outputStatistics': True,
'writeSetupForestAndReturn': True,
},
'Parameters': {
'omega': 1.95,
'timesteps': self.timesteps,
'remainingTimeLoggerFrequency': self.logger_frequency,
'vtkWriteFrequency': self.vtk_write_frequency,
'useVTKAMRWriter': True,
'oneFilePerProcess': False,
'writeOnlySlice': False,
'gpuEnabledMPI': self.gpu_enabled_mpi,
'gpuBlockSize': (128, 1, 1),
},
'Logging': {
'logLevel': "info",
}
}
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
num_tries = 4
# Retry a few times; the write may fail when several benchmark processes access the database concurrently
table_name = "runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
domain_size = (192, 192, 64)
cells_per_block = (64, 64, 64)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
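# With the sizes above this gives root_blocks == (3, 3, 1), i.e. nine root blocks
# before refinement (192 // 64 == 3 and 64 // 64 == 1).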
scenarios = wlb.ScenarioManager()
scenario = Scenario(domain_size=domain_size,
root_blocks=root_blocks,
cells_per_block=cells_per_block,
timesteps=0,
vtk_write_frequency=0,
refinement_depth=3,
gpu_enabled_mpi=False)
scenarios.add(scenario)
def weak_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running weak scaling benchmark...")
# This benchmark is intended to be run on 16 or more GPUs
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if uniform:
factor = 3 * num_proc
name = "uniform"
else:
if num_proc % 16 != 0:
raise RuntimeError("Number of processes must be divisible by 16")
factor = int(num_proc // 16)
name = "nonuniform"
cells_per_block = (WeakX, WeakY, WeakZ)
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
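# Hedged example of the resulting setup: for the non-uniform case with num_proc = 16 and
# the default WeakX/WeakY/WeakZ of 128, factor == 1, domain_size == (384, 384, 128) and
# root_blocks == (3, 3, 1); the refinement (depth 3) then multiplies the block count.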
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"weakScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
def strong_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running strong scaling benchmark...")
# This benchmark is intended to be run on 64 or more GPUs
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if num_proc % 64 != 0:
raise RuntimeError("Number of processes must be divisible by 64")
cells_per_block = (StrongX, StrongY, StrongZ)
if uniform:
domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
name = "uniform"
else:
factor = int(num_proc / 64)
blocks64 = block_decomposition(factor)
cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
name = "nonuniform"
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
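# Hedged example: in the uniform case with the default StrongX/StrongY/StrongZ of 128,
# domain_size == (256, 256, 2048) and root_blocks == (2, 2, 16), i.e. 64 root blocks,
# which is why num_proc has to be a multiple of 64.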
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"strongScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
if BENCHMARK == 0:
validation_run()
elif BENCHMARK == 1:
weak_scaling_ldc(1, True, False)
elif BENCHMARK == 2:
strong_scaling_ldc(1, True, False)
else:
print(f"Invalid benchmark case {BENCHMARK}")
waLBerla_link_files_to_builddir("*.prm")
if (WALBERLA_BUILD_WITH_CODEGEN)
if (NOT WALBERLA_BUILD_WITH_GPU_SUPPORT OR (WALBERLA_BUILD_WITH_GPU_SUPPORT AND (CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60 OR WALBERLA_BUILD_WITH_HIP)))
waLBerla_add_executable(NAME Percolation FILES Percolation.cpp
DEPENDS walberla::blockforest walberla::core walberla::field walberla::geometry walberla::gpu walberla::lbm walberla::lbm_mesapd_coupling walberla::mesa_pd walberla::sqlite walberla::vtk PSMCodegenPython_trt-smagorinsky_sc1 )
target_compile_definitions(Percolation PRIVATE Weighting=2)
endif ()
endif ()
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file Percolation.cpp
//! \ingroup lbm_mesapd_coupling
//! \author Samuel Kemmler <samuel.kemmler@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/DataTypes.h"
#include "core/Environment.h"
#include "core/grid_generator/SCIterator.h"
#include "core/logging/all.h"
#include "core/timing/RemainingTimeLogger.h"
#include "field/AddToStorage.h"
#include "field/vtk/all.h"
#include "geometry/InitBoundaryHandling.h"
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "lbm/PerformanceLogger.h"
#include "lbm/vtk/all.h"
#include "lbm_mesapd_coupling/DataTypesCodegen.h"
#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
#include "mesa_pd/data/DataTypes.h"
#include "mesa_pd/data/ParticleAccessorWithShape.h"
#include "mesa_pd/data/ParticleStorage.h"
#include "mesa_pd/data/ShapeStorage.h"
#include "mesa_pd/data/shape/Sphere.h"
#include "mesa_pd/domain/BlockForestDomain.h"
#include "mesa_pd/mpi/SyncNextNeighbors.h"
#include "mesa_pd/vtk/ParticleVtkOutput.h"
#include "sqlite/SQLite.h"
#include "vtk/all.h"
#include "LBMSweep.h"
#include "PSMPackInfo.h"
#include "PSMSweep.h"
#include "PSM_Density.h"
#include "PSM_InfoHeader.h"
#include "PSM_MacroGetter.h"
namespace percolation
{
///////////
// USING //
///////////
using namespace walberla;
using namespace lbm_mesapd_coupling::psm::gpu;
typedef pystencils::PSMPackInfo PackInfo_T;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
///////////
// FLAGS //
///////////
const FlagUID Fluid_Flag("Fluid");
const FlagUID Density_Flag("Density");
const FlagUID NoSlip_Flag("NoSlip");
const FlagUID Inflow_Flag("Inflow");
//////////
// MAIN //
//////////
//*******************************************************************************************************************
/*!\brief Benchmark of a percolation setup
*
* This code can be used as a percolation (useParticles=true) or as a channel flow (useParticles=false) benchmark.
* A constant inflow from west is applied and a pressure boundary condition is set at the east.
* For the percolation, mono-sized fixed spherical particles are generated on a structured grid with an offset for
* every second particle layer in flow direction to avoid channels in flow direction. The flow is described by Darcy's
* law. For the channel flow, the flow is described by the Hagen–Poiseuille equation.
*
* The domain is either periodic or bounded by (no slip) walls in the vertical directions (y and z).
*
* For the percolation, the PSM is used in combination with a two-way coupling, but no particle dynamics.
* For the channel flow, only the LBM is used.
*
* The parameters can be changed via the input file.
*
*/
//*******************************************************************************************************************
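// For orientation only (hedged, not used by the code below): in the creeping-flow regime
// targeted here and in LBM units with density close to 1, the reference solutions are
//   Darcy's law (percolation):            <u> = -(k / nu) * dp/dx,   with permeability k,
//   plane Poiseuille (channel of gap H):  u(y) = (-dp/dx) / (2 * nu) * y * (H - y).
// These textbook forms serve only as a rough sanity check for the simulated velocities.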
int main(int argc, char** argv)
{
Environment env(argc, argv);
auto cfgFile = env.config();
if (!cfgFile) { WALBERLA_ABORT("Usage: " << argv[0] << " path-to-configuration-file \n"); }
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
gpu::selectDeviceBasedOnMpiRank();
#endif
WALBERLA_LOG_INFO_ON_ROOT("waLBerla revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8));
WALBERLA_LOG_INFO_ON_ROOT("compiler flags: " << std::string(WALBERLA_COMPILER_FLAGS));
WALBERLA_LOG_INFO_ON_ROOT("build machine: " << std::string(WALBERLA_BUILD_MACHINE));
WALBERLA_LOG_INFO_ON_ROOT(*cfgFile);
// Read config file
Config::BlockHandle numericalSetup = cfgFile->getBlock("NumericalSetup");
const uint_t numXBlocks = numericalSetup.getParameter< uint_t >("numXBlocks");
const uint_t numYBlocks = numericalSetup.getParameter< uint_t >("numYBlocks");
const uint_t numZBlocks = numericalSetup.getParameter< uint_t >("numZBlocks");
WALBERLA_CHECK_EQUAL(numXBlocks * numYBlocks * numZBlocks, uint_t(MPIManager::instance()->numProcesses()),
"When using GPUs, the number of blocks ("
<< numXBlocks * numYBlocks * numZBlocks << ") has to match the number of MPI processes ("
<< uint_t(MPIManager::instance()->numProcesses()) << ")");
const bool periodicInY = numericalSetup.getParameter< bool >("periodicInY");
const bool periodicInZ = numericalSetup.getParameter< bool >("periodicInZ");
const uint_t numXCellsPerBlock = numericalSetup.getParameter< uint_t >("numXCellsPerBlock");
const uint_t numYCellsPerBlock = numericalSetup.getParameter< uint_t >("numYCellsPerBlock");
const uint_t numZCellsPerBlock = numericalSetup.getParameter< uint_t >("numZCellsPerBlock");
const bool sendDirectlyFromGPU = numericalSetup.getParameter< bool >("sendDirectlyFromGPU");
const bool useCommunicationHiding = numericalSetup.getParameter< bool >("useCommunicationHiding");
const Vector3< uint_t > frameWidth = numericalSetup.getParameter< Vector3< uint_t > >("frameWidth");
const uint_t timeSteps = numericalSetup.getParameter< uint_t >("timeSteps");
const bool useParticles = numericalSetup.getParameter< bool >("useParticles");
const real_t particleDiameter = numericalSetup.getParameter< real_t >("particleDiameter");
const real_t particleGenerationSpacing = numericalSetup.getParameter< real_t >("particleGenerationSpacing");
const Vector3< real_t > generationDomainFraction =
numericalSetup.getParameter< Vector3< real_t > >("generationDomainFraction");
const Vector3< uint_t > generationPointOfReferenceOffset =
numericalSetup.getParameter< Vector3< uint_t > >("generationPointOfReferenceOffset");
const bool useParticleOffset = numericalSetup.getParameter< bool >("useParticleOffset");
const Vector3< uint_t > particleSubBlockSize =
numericalSetup.getParameter< Vector3< uint_t > >("particleSubBlockSize");
const real_t uInflow = numericalSetup.getParameter< real_t >("uInflow");
const real_t relaxationRate = numericalSetup.getParameter< real_t >("relaxationRate");
if ((periodicInY && numYBlocks == 1) || (periodicInZ && numZBlocks == 1))
{
WALBERLA_LOG_WARNING_ON_ROOT("Using only 1 block in periodic dimensions can lead to unexpected behavior.")
}
const real_t viscosity = lbm::collision_model::viscosityFromOmega(relaxationRate);
WALBERLA_LOG_DEVEL_VAR_ON_ROOT(viscosity)
Config::BlockHandle outputSetup = cfgFile->getBlock("Output");
const uint_t vtkSpacing = outputSetup.getParameter< uint_t >("vtkSpacing");
const std::string vtkFolder = outputSetup.getParameter< std::string >("vtkFolder");
const uint_t performanceLogFrequency = outputSetup.getParameter< uint_t >("performanceLogFrequency");
///////////////////////////
// BLOCK STRUCTURE SETUP //
///////////////////////////
const bool periodicInX = false;
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid(
numXBlocks, numYBlocks, numZBlocks, numXCellsPerBlock, numYCellsPerBlock, numZCellsPerBlock, real_t(1), uint_t(0),
false, false, periodicInX, periodicInY, periodicInZ, // periodicity
false);
auto simulationDomain = blocks->getDomain();
////////////
// MesaPD //
////////////
auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
// Init data structures
auto ps = walberla::make_shared< mesa_pd::data::ParticleStorage >(1);
auto ss = walberla::make_shared< mesa_pd::data::ShapeStorage >();
using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
auto accessor = walberla::make_shared< ParticleAccessor_T >(ps, ss);
auto sphereShape = ss->create< mesa_pd::data::Sphere >(particleDiameter * real_t(0.5));
// Create spheres
if (useParticles)
{
// Ensure that generation domain is computed correctly
WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.xMin(), real_t(0));
WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.yMin(), real_t(0));
WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.zMin(), real_t(0));
auto generationDomain = math::AABB::createFromMinMaxCorner(
math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) - generationDomainFraction[0]) / real_t(2),
simulationDomain.yMax() * (real_t(1) - generationDomainFraction[1]) / real_t(2),
simulationDomain.zMax() * (real_t(1) - generationDomainFraction[2]) / real_t(2)),
math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) + generationDomainFraction[0]) / real_t(2),
simulationDomain.yMax() * (real_t(1) + generationDomainFraction[1]) / real_t(2),
simulationDomain.zMax() * (real_t(1) + generationDomainFraction[2]) / real_t(2)));
real_t particleOffset = particleGenerationSpacing / real_t(2);
for (auto pt :
grid_generator::SCGrid(generationDomain, generationDomain.center() + generationPointOfReferenceOffset,
particleGenerationSpacing))
{
// Offset every second particle layer in flow direction to avoid channels in flow direction
if (useParticleOffset &&
uint_t(round(math::abs(generationDomain.center()[0] - pt[0]) / (particleGenerationSpacing))) % uint_t(2) !=
uint_t(0))
{
pt = pt + Vector3(real_t(0), particleOffset, particleOffset);
}
if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), pt))
{
mesa_pd::data::Particle&& p = *ps->create();
p.setPosition(pt);
p.setInteractionRadius(particleDiameter * real_t(0.5));
p.setOwner(mpi::MPIManager::instance()->rank());
p.setShapeID(sphereShape);
p.setType(0);
}
}
}
////////////////////////
// ADD DATA TO BLOCKS //
////////////////////////
// Setting initial PDFs to nan helps to detect bugs in the initialization/BC handling
// Depending on WALBERLA_BUILD_WITH_GPU_SUPPORT, pdfFieldCPUGPUID is either a GPU or a CPU field
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
BlockDataID pdfFieldID =
field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
BlockDataID BFieldID = field::addToStorage< BField_T >(blocks, "B field", 0, field::fzyx, 1);
BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
#else
BlockDataID pdfFieldCPUGPUID =
field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
#endif
BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "density field", real_t(0), field::fzyx);
BlockDataID velFieldID = field::addToStorage< VelocityField_T >(blocks, "velocity field", real_t(0), field::fzyx);
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
// Synchronize particles between the blocks for the correct mapping of ghost particles
mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
syncNextNeighborFunc(*ps, *rpdDomain);
// Assemble boundary block string
std::string boundariesBlockString = " Boundaries"
"{"
"Border { direction W; walldistance -1; flag Inflow; }"
"Border { direction E; walldistance -1; flag Density; }";
if (!periodicInY)
{
boundariesBlockString += "Border { direction S; walldistance -1; flag NoSlip; }"
"Border { direction N; walldistance -1; flag NoSlip; }";
}
if (!periodicInZ)
{
boundariesBlockString += "Border { direction T; walldistance -1; flag NoSlip; }"
"Border { direction B; walldistance -1; flag NoSlip; }";
}
boundariesBlockString += "}";
WALBERLA_ROOT_SECTION()
{
std::ofstream boundariesFile("boundaries.prm");
boundariesFile << boundariesBlockString;
boundariesFile.close();
}
WALBERLA_MPI_BARRIER()
auto boundariesCfgFile = Config();
boundariesCfgFile.readParameterFile("boundaries.prm");
auto boundariesConfig = boundariesCfgFile.getBlock("Boundaries");
// map boundaries into the LBM simulation
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, Fluid_Flag);
lbm::PSM_Density density_bc(blocks, pdfFieldCPUGPUID, real_t(1.0));
density_bc.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Density_Flag, Fluid_Flag);
lbm::PSM_NoSlip noSlip(blocks, pdfFieldCPUGPUID);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, NoSlip_Flag, Fluid_Flag);
lbm::PSM_UBB ubb(blocks, pdfFieldCPUGPUID, uInflow, real_t(0), real_t(0));
ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Inflow_Flag, Fluid_Flag);
///////////////
// TIME LOOP //
///////////////
// Map particles into the fluid domain
ParticleAndVolumeFractionSoA_T< Weighting > particleAndVolumeFractionSoA(blocks, relaxationRate);
PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
particleAndVolumeFractionSoA, particleSubBlockSize);
if (useParticles)
{
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
{
psmSweepCollection.particleMappingSweep(&(*blockIt));
}
}
// Initialize PDFs
pystencils::InitializeDomainForPSM pdfSetter(
particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
real_t(1.0), real_t(0), real_t(0), real_t(0));
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
{
// pdfSetter requires particle velocities at cell centers
if (useParticles) { psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt)); }
pdfSetter(&(*blockIt));
}
// Setup of the LBM communication for synchronizing the pdf field between neighboring blocks
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU, false);
#else
walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
#endif
com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
auto communication = std::function< void() >([&]() { com.communicate(); });
SweepTimeloop commTimeloop(blocks->getBlockStorage(), timeSteps);
SweepTimeloop timeloop(blocks->getBlockStorage(), timeSteps);
timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, real_t(0.0), real_t(0.0),
real_t(0.0));
#else
pystencils::PSM_MacroGetter getterSweep(particleAndVolumeFractionSoA.BFieldID, densityFieldID, pdfFieldCPUGPUID,
velFieldID, real_t(0.0), real_t(0.0), real_t(0.0));
#endif
// VTK output
if (vtkSpacing != uint_t(0))
{
// Spheres
auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid");
particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
// Limit output to process-local spheres
particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {
return pIt->getShapeID() == sphereShape &&
!(mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST));
});
auto particleVtkWriter = vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacing, vtkFolder);
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
// Fields
auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid", vtkSpacing, 0, false, vtkFolder);
pdfFieldVTK->addBeforeFunction(communication);
pdfFieldVTK->addBeforeFunction([&]() {
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldCPUGPUID);
gpu::fieldCpy< GhostLayerField< real_t, 1 >, BFieldGPU_T >(blocks, BFieldID,
particleAndVolumeFractionSoA.BFieldID);
#endif
for (auto& block : *blocks)
getterSweep(&block);
});
pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< DensityField_T > >(densityFieldID, "Density"));
#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(BFieldID, "OverlapFraction"));
#else
pdfFieldVTK->addCellDataWriter(
make_shared< field::VTKWriter< BField_T > >(particleAndVolumeFractionSoA.BFieldID, "OverlapFraction"));
#endif
pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< FlagField_T > >(flagFieldID, "FlagField"));
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
}
if (vtkSpacing != uint_t(0)) { vtk::writeDomainDecomposition(blocks, "domain_decomposition", vtkFolder); }
// Add performance logging
lbm::PerformanceLogger< FlagField_T > performanceLogger(blocks, flagFieldID, Fluid_Flag, performanceLogFrequency);
if (performanceLogFrequency > 0)
{
timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluate performance logging");
}
// Add LBM communication function and boundary handling sweep
if (useCommunicationHiding)
{
timeloop.add() << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
}
else
{
timeloop.add() << BeforeFunction(communication, "LBM Communication")
<< Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
}
timeloop.add() << Sweep(deviceSyncWrapper(ubb.getSweep()), "Boundary Handling (UBB)");
if (!periodicInY || !periodicInZ)
{
timeloop.add() << Sweep(deviceSyncWrapper(noSlip.getSweep()), "Boundary Handling (NoSlip)");
}
// PSM kernel
pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
particleAndVolumeFractionSoA.particleForcesFieldID,
particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0.0),
real_t(0.0), real_t(0.0), relaxationRate);
pystencils::PSMSweepSplit PSMSplitSweep(
particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
particleAndVolumeFractionSoA.particleForcesFieldID, particleAndVolumeFractionSoA.particleVelocitiesFieldID,
pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate, frameWidth);
pystencils::LBMSweep LBMSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate);
pystencils::LBMSplitSweep LBMSplitSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate,
frameWidth);
if (useParticles)
{
if (useCommunicationHiding)
{
addPSMSweepsToTimeloops(commTimeloop, timeloop, com, psmSweepCollection, PSMSplitSweep);
}
else { addPSMSweepsToTimeloop(timeloop, psmSweepCollection, PSMSweep); }
}
else
{
if (useCommunicationHiding)
{
commTimeloop.add() << BeforeFunction([&]() { com.startCommunication(); }, "LBM Communication (start)")
<< Sweep(deviceSyncWrapper(LBMSplitSweep.getInnerSweep()), "LBM inner sweep")
<< AfterFunction([&]() { com.wait(); }, "LBM Communication (wait)");
timeloop.add() << Sweep(deviceSyncWrapper(LBMSplitSweep.getOuterSweep()), "LBM outer sweep");
}
else { timeloop.add() << Sweep(deviceSyncWrapper(LBMSweep), "LBM sweep"); }
}
WcTimingPool timeloopTiming;
// TODO: maybe add warmup phase
for (uint_t timeStep = 0; timeStep < timeSteps; ++timeStep)
{
if (useCommunicationHiding) { commTimeloop.singleStep(timeloopTiming); }
timeloop.singleStep(timeloopTiming);
}
timeloopTiming.logResultOnRoot();
auto timeloopTimingReduced = timeloopTiming.getReduced();
// Write parameters and performance results in sqlite database
WALBERLA_ROOT_SECTION()
{
// Use DB_FILE environment variable if set
std::string dbFile;
if (std::getenv("DB_FILE") != nullptr) { dbFile = std::getenv("DB_FILE"); }
else
{
if (useParticles) { dbFile = "percolation_benchmark.sqlite3"; }
else { dbFile = "channel_flow_benchmark.sqlite3"; }
}
std::map< std::string, int > integerProperties;
std::map< std::string, double > realProperties;
std::map< std::string, std::string > stringProperties;
integerProperties["numXBlocks"] = int(numXBlocks);
integerProperties["numYBlocks"] = int(numYBlocks);
integerProperties["numZBlocks"] = int(numZBlocks);
integerProperties["numXCellsPerBlock"] = int(numXCellsPerBlock);
integerProperties["numYCellsPerBlock"] = int(numYCellsPerBlock);
integerProperties["numZCellsPerBlock"] = int(numZCellsPerBlock);
integerProperties["timeSteps"] = int(timeSteps);
integerProperties["sendDirectlyFromGPU"] = int(sendDirectlyFromGPU);
integerProperties["useCommunicationHiding"] = int(useCommunicationHiding);
integerProperties["communicationHidingXWidth"] = int(frameWidth[0]);
integerProperties["communicationHidingYWidth"] = int(frameWidth[1]);
integerProperties["communicationHidingZWidth"] = int(frameWidth[2]);
integerProperties["useParticles"] = int(useParticles);
integerProperties["numParticles"] = int(ps->size());
integerProperties["particleSubBlockXSize"] = int(particleSubBlockSize[0]);
integerProperties["particleSubBlockYSize"] = int(particleSubBlockSize[1]);
integerProperties["particleSubBlockZSize"] = int(particleSubBlockSize[2]);
realProperties["particleDiameter"] = double(particleDiameter);
realProperties["particleGenerationSpacing"] = double(particleGenerationSpacing);
performanceLogger.getBestResultsForSQLOnRoot(integerProperties, realProperties, stringProperties);
auto runId = sqlite::storeRunInSqliteDB(dbFile, integerProperties, stringProperties, realProperties);
sqlite::storeTimingPoolInSqliteDB(dbFile, runId, *timeloopTimingReduced, "Timeloop");
}
return EXIT_SUCCESS;
}
} // namespace percolation
int main(int argc, char** argv) { return percolation::main(argc, argv); }
NumericalSetup
{
// the product of the numbers of blocks must equal the number of MPI processes
numXBlocks 1;
numYBlocks 1;
numZBlocks 1;
periodicInY false;
periodicInZ false;
numXCellsPerBlock 256;
numYCellsPerBlock 128;
numZCellsPerBlock 128;
timeSteps 100;
sendDirectlyFromGPU false; // if true, use direct GPU-GPU communication
useCommunicationHiding false;
frameWidth <1, 1, 1>; // width of the outer region if splitting the LBM/PSM into inner and outer (only used if useCommunicationHiding is true)
// particle distribution in LBM units
useParticles true; // if true, use the PSM (particle mapping, velocity computation, hydrodynamic force reduction); otherwise run plain LBM
particleDiameter 20.0;
particleGenerationSpacing 21.0;
generationDomainFraction <0.8, 1.0, 1.0>; // fraction of the domain where particles are generated
generationPointOfReferenceOffset <0, 0, 0>; // offset of point of reference from domain center, see SCIterator.h
useParticleOffset true; // offset every second particle layer in flow direction
particleSubBlockSize <8, 8, 8>;
// fluid quantities in LBM units
uInflow 0.00008;
relaxationRate 0.9;
}
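// Rough sanity check (hedged, not read by the application): with relaxationRate 0.9 the
// lattice viscosity is nu = (1/omega - 0.5) / 3 ~ 0.204, so the particle Reynolds number is
// Re = uInflow * particleDiameter / nu ~ 0.00008 * 20 / 0.204 ~ 0.008, i.e. well inside the
// creeping-flow regime assumed for Darcy's law.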
Output
{
vtkSpacing 0;
vtkFolder vtk_out;
performanceLogFrequency 100;
}
waLBerla_link_files_to_builddir(*.prm)
waLBerla_link_files_to_builddir(*.py)
if (WALBERLA_BUILD_WITH_CUDA)
waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGenGPU
FILE multiphase_codegen.py
OUT_FILES initialize_phase_field_distributions.cu initialize_phase_field_distributions.h
initialize_velocity_based_distributions.cu initialize_velocity_based_distributions.h
phase_field_LB_step.cu phase_field_LB_step.h
hydro_LB_step.cu hydro_LB_step.h
PackInfo_phase_field_distributions.cu PackInfo_phase_field_distributions.h
PackInfo_phase_field.cu PackInfo_phase_field.h
PackInfo_velocity_based_distributions.cu PackInfo_velocity_based_distributions.h
GenDefines.h)
waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
FILE multiphase_codegen.py
OUT_FILES initialize_phase_field_distributions.${CODEGEN_FILE_SUFFIX} initialize_phase_field_distributions.h
initialize_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} initialize_velocity_based_distributions.h
phase_field_LB_step.${CODEGEN_FILE_SUFFIX} phase_field_LB_step.h
hydro_LB_step.${CODEGEN_FILE_SUFFIX} hydro_LB_step.h
PackInfo_phase_field_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field_distributions.h
PackInfo_phase_field.${CODEGEN_FILE_SUFFIX} PackInfo_phase_field.h
PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h
GenDefines.h)
if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core cuda field postprocessing lbm geometry timeloop gui BenchmarkPhaseFieldCodeGenGPU)
set_target_properties(benchmark_multiphase PROPERTIES CXX_VISIBILITY_PRESET hidden)
DEPENDS walberla::blockforest walberla::core walberla::gpu walberla::field walberla::postprocessing walberla::python_coupling walberla::lbm_generated walberla::geometry walberla::timeloop BenchmarkPhaseFieldCodeGen )
else ()
waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGenCPU
FILE multiphase_codegen.py
OUT_FILES initialize_phase_field_distributions.cpp initialize_phase_field_distributions.h
initialize_velocity_based_distributions.cpp initialize_velocity_based_distributions.h
phase_field_LB_step.cpp phase_field_LB_step.h
hydro_LB_step.cpp hydro_LB_step.h
PackInfo_phase_field_distributions.cpp PackInfo_phase_field_distributions.h
PackInfo_phase_field.cpp PackInfo_phase_field.h
PackInfo_velocity_based_distributions.cpp PackInfo_velocity_based_distributions.h
GenDefines.h)
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core field postprocessing lbm geometry timeloop gui BenchmarkPhaseFieldCodeGenCPU)
set_target_properties(benchmark_multiphase PROPERTIES CXX_VISIBILITY_PRESET hidden)
endif (WALBERLA_BUILD_WITH_CUDA)
DEPENDS walberla::blockforest walberla::core walberla::field walberla::postprocessing walberla::python_coupling walberla::lbm_generated walberla::geometry walberla::timeloop BenchmarkPhaseFieldCodeGen )
endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
@@ -3,30 +3,50 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from waLBerla.tools.config import block_decomposition
import sys
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
"""Checks if a single block of given size fits into GPU memory"""
cells = prod(b + 2 * gls for b in block_size)
# two PDF sets (current and temporary) for the phase-field and the hydrodynamic LB, plus 3 velocity values and the phase field with its temporary copy
values_per_cell = 2 * q_phase + 2 * q_hydro + 3 + 2
needed_memory = values_per_cell * cells * size_per_value
return needed_memory < total_mem
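# Hedged back-of-the-envelope check of the estimate above: for the 320^3 block used in
# benchmark() below with one ghost layer, cells = 322**3 (about 3.3e7) and
# values_per_cell = 2*15 + 2*27 + 3 + 2 = 89, so a block needs roughly
# 89 * 322**3 * 8 bytes ~ 22 GiB and fits into the default 40 GB budget (GPU_MEMORY_GB).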
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
def __init__(self, time_step_strategy,
cuda_block_size,
cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 101
edge_size = 32
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.cells_per_block = cells_per_block
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.size = (self.cells_per_block[0] * self.blocks[0],
self.cells_per_block[1] * self.blocks[1],
self.cells_per_block[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.timeStepStrategy = time_step_strategy
self.cuda_block_size = cuda_block_size
self.warmupSteps = 10
self.cudaEnabledMpi = cuda_enabled_mpi
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
@@ -41,19 +61,18 @@ class Scenario:
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
'cudaEnabledMpi': self.cudaEnabledMpi
},
'Boundaries_GPU': {
'Border': []
@@ -77,6 +96,14 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
@@ -86,43 +113,22 @@ class Scenario:
df.to_csv(self.csv_file, index=False, mode='a', header=False)
def overlap_benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 4, 2), (64, 4, 2), (128, 2, 2),
(32, 8, 1), (64, 8, 1), (64, 4, 2),
(32, 16, 1), (16, 16, 1), (16, 16, 2)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
for cuda_block in cuda_blocks:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth,
cuda_block_size=cuda_block)
scenarios.add(scenario)
def kernel_benchmark():
def benchmark():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(1, 1, 1),
cuda_block_size=(128, 1, 1))
scenarios.add(scenario)
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_size = (320, 320, 320)
cuda_enabled_mpi = True
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
else:
scenarios.add(Scenario(time_step_strategy='normal',
cuda_block_size=(128, 1, 1),
cells_per_block=block_size,
cuda_enabled_mpi=cuda_enabled_mpi))
# overlap_benchmark()
kernel_benchmark()
benchmark()
@@ -3,57 +3,55 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from waLBerla.tools.config import block_decomposition
import sys
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth):
def __init__(self, time_step_strategy, cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = -1
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 500
edge_size = 50
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.timesteps = 101
self.cells_per_block = cells_per_block
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.size = (self.cells_per_block[0] * self.blocks[0],
self.cells_per_block[1] * self.blocks[1],
self.cells_per_block[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.cuda_block_size = (-1, -1, -1)
self.timeStepStrategy = time_step_strategy
self.warmupSteps = 10
self.cudaEnabledMpi = cuda_enabled_mpi
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
self.csv_file = "benchmark_cpu.csv"
self.csv_file = "benchmark.csv"
@wlb.member_callback
def config(self):
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': 10.0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
'scenario': 1,
'cudaEnabledMpi': self.cudaEnabledMpi
},
'Boundaries_GPU': {
'Border': []
......@@ -86,31 +84,23 @@ class Scenario:
df.to_csv(self.csv_file, index=False, mode='a', header=False)
def overlap_benchmark():
def benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32),
(64, 32, 32), (64, 64, 32), (64, 64, 64)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth)
scenarios.add(scenario)
block_size = (64, 64, 64)
scenarios.add(Scenario(time_step_strategy='normal', cells_per_block=block_size))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
# overlap
scenario = Scenario(timeStepStrategy='overlap',
overlappingWidth=(8, 8, 8))
scenarios.add(scenario)
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for block_size in block_sizes:
scenario = Scenario(time_step_strategy=time_step_strategy,
cells_per_block=block_size)
scenarios.add(scenario)
# overlap_benchmark()
# benchmark()
kernel_benchmark()
......@@ -18,7 +18,6 @@
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
......@@ -30,6 +29,7 @@
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
......@@ -37,8 +37,6 @@
#include "timeloop/SweepTimeloop.h"
#include <blockforest/communication/UniformBufferedScheme.h>
#include "InitializerFunctions.h"
//////////////////////////////
......@@ -46,53 +44,31 @@
////////////////////////////
#if defined(WALBERLA_BUILD_WITH_CUDA)
# include "cuda/AddGPUFieldToStorage.h"
# include "cuda/DeviceSelectMPI.h"
# include "cuda/HostFieldAllocator.h"
# include "cuda/ParallelStreams.h"
# include "cuda/communication/GPUPackInfo.h"
# include "cuda/communication/MemcpyPackInfo.h"
# include "cuda/communication/UniformGPUScheme.h"
# include "GenDefines.h"
# include "PackInfo_phase_field.h"
# include "PackInfo_phase_field_distributions.h"
# include "PackInfo_velocity_based_distributions.h"
# include "hydro_LB_step.h"
# include "initialize_phase_field_distributions.h"
# include "initialize_velocity_based_distributions.h"
# include "phase_field_LB_step.h"
# include "gpu/AddGPUFieldToStorage.h"
# include "gpu/DeviceSelectMPI.h"
# include "gpu/ParallelStreams.h"
# include "gpu/communication/MemcpyPackInfo.h"
# include "gpu/communication/UniformGPUScheme.h"
#else
# include "GenDefines.h"
# include "PackInfo_phase_field.h"
# include "PackInfo_phase_field_distributions.h"
# include "PackInfo_velocity_based_distributions.h"
# include "hydro_LB_step.h"
# include "initialize_phase_field_distributions.h"
# include "initialize_velocity_based_distributions.h"
# include "phase_field_LB_step.h"
# include <blockforest/communication/UniformBufferedScheme.h>
#endif
#include "GenDefines.h"
using namespace walberla;
using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
using VelocityField_T = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
using PhaseField_T = GhostLayerField< real_t, 1 >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
#if defined(WALBERLA_BUILD_WITH_CUDA)
typedef cuda::GPUField< real_t > GPUField;
typedef gpu::GPUField< real_t > GPUField;
#endif
// using CommScheme_T = cuda::communication::UniformGPUScheme<stencil::D2Q9>;
int main(int argc, char** argv)
{
mpi::Environment env(argc, argv);
const mpi::Environment env(argc, argv);
#if defined(WALBERLA_BUILD_WITH_CUDA)
cuda::selectDeviceBasedOnMpiRank();
gpu::selectDeviceBasedOnMpiRank();
#endif
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
......@@ -103,39 +79,35 @@ int main(int argc, char** argv)
logging::configureLogging(config);
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
Vector3< int > overlappingWidth =
parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
#if defined(WALBERLA_BUILD_WITH_CUDA)
// CPU fields
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_t(0), field::fzyx);
const BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
// GPU fields
BlockDataID lb_phase_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >(
const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
BlockDataID lb_velocity_field_gpu = cuda::addGPUFieldToStorage< cuda::GPUField< real_t > >(
const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
BlockDataID vel_field_gpu =
cuda::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
const BlockDataID vel_field_gpu =
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
BlockDataID phase_field_gpu =
cuda::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
#else
BlockDataID lb_phase_field =
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_t(0), field::fzyx);
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
BlockDataID lb_velocity_field =
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_t(0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_t(0), field::fzyx);
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
#endif
if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
......@@ -146,7 +118,7 @@ int main(int argc, char** argv)
auto bubbleParameters = config->getOneBlock("Bubble");
const Vector3< real_t > bubbleMidPoint =
bubbleParameters.getParameter< Vector3< real_t > >("bubbleMidPoint");
const real_t bubbleRadius = bubbleParameters.getParameter< real_t >("bubbleRadius", 20.0);
const real_t bubbleRadius = bubbleParameters.getParameter< real_t >("bubbleRadius", real_c(20.0));
initPhaseField_bubble(blocks, phase_field, bubbleRadius, bubbleMidPoint);
}
else if (scenario == 2)
......@@ -154,7 +126,7 @@ int main(int argc, char** argv)
initPhaseField_RTI(blocks, phase_field);
}
#if defined(WALBERLA_BUILD_WITH_CUDA)
cuda::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field);
gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field);
#endif
WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field done")
}
......@@ -166,54 +138,83 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1],
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
gpuBlockSize[1],
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field, phase_field, vel_field, Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field,
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif
// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto Comm_velocity_based_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
const int streamLowPriority = 0;
const int streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
auto Comm_phase_field_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
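// The generated phase_field_LB_step writes the updated phase field into phase_field_tmp
// (the code generation script registers C_tmp as its density output). Swapping the data
// pointers after every time step makes the new values available under phase_field_gpu
// without an extra device copy.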
#else
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< PhaseField_T >(phase_field);
auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#endif
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
// Boundaries
const FlagUID fluidFlagUID("Fluid");
auto boundariesConfig = config->getBlock("Boundaries_GPU");
......@@ -232,177 +233,105 @@ int main(int argc, char** argv)
init_h(&block);
init_g(&block);
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
}
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
int streamLowPriority = 0;
int streamHighPriority = 0;
auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = cuda::ParallelStreams(streamHighPriority);
#endif
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto normalTimeStep = [&]() {
for (auto& block : *blocks)
{
Comm_phase_field_distributions->communicate(nullptr);
phase_field_LB_step(&block);
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
Comm_velocity_based_distributions->communicate(nullptr);
hydro_LB_step(&block);
}
};
auto simpleOverlapTimeStep = [&]() {
Comm_phase_field_distributions->startCommunication(defaultStream);
for (auto& block : *blocks)
phase_field_LB_step.inner(&block, defaultStream);
Comm_phase_field_distributions->wait(defaultStream);
for (auto& block : *blocks)
phase_field_LB_step.outer(&block, defaultStream);
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
Comm_velocity_based_distributions->startCommunication(defaultStream);
for (auto& block : *blocks)
hydro_LB_step.inner(&block, defaultStream);
Comm_velocity_based_distributions->wait(defaultStream);
for (auto& block : *blocks)
hydro_LB_step.outer(&block, defaultStream);
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#else
auto normalTimeStep = [&]() {
for (auto& block : *blocks)
{
Comm_phase_field_distributions();
phase_field_LB_step(&block);
Comm_velocity_based_distributions();
hydro_LB_step(&block);
}
};
auto simpleOverlapTimeStep = [&]() {
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step.inner(&block);
Comm_phase_field_distributions.wait();
for (auto& block : *blocks)
phase_field_LB_step.outer(&block);
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step.inner(&block);
Comm_velocity_based_distributions.wait();
for (auto& block : *blocks)
hydro_LB_step.outer(&block);
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "overlap")
{
timeStep = std::function< void() >(simpleOverlapTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
}
else if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
}
else if (timeStepStrategy == "hydro_only")
{
timeStep = std::function< void() >(hydro_only);
WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
}
else if (timeStepStrategy == "kernel_only")
{
timeStep = std::function< void() >(without_comm);
WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
}
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
}
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
// remaining time logger
if (remainingTimeLoggerFrequency > 0)
timeLoop->addFuncAfterTimeStep(
timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
"remaining time logger");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
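// Each PDF communication is started before the sweep that does not need its ghost data and
// completed right after it: the hydro PDF exchange is hidden behind the phase-field LB step,
// and the phase-field PDF exchange behind the hydro LB step and the field swap, so data
// transfer overlaps with kernel execution.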
#endif
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 1)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
#if defined(WALBERLA_BUILD_WITH_CUDA)
vtkOutput->addBeforeFunction([&]() {
cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
// cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
});
vtkOutput->addBeforeFunction(
[&]() { gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); });
#endif
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter);
timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop->singleStep();
timeloop.singleStep();
timeLoop->setCurrentTimeStepToZero();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
timeloop.setCurrentTimeStepToZero();
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WcTimingPool timeloopTiming;
WcTimer simTimer;
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
#endif
simTimer.start();
timeLoop->run();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
auto time = simTimer.last();
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
#endif
// Call Python function to report results
pythonCallbackResults();
}
import os
import waLBerla as wlb
import csv
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
import sys
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
# output frequencies
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 201
edge_size = 100
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.cuda_block_size = cuda_block_size
self.warmupSteps = 20
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
self.csv_file = "benchmark_piz_daint.csv"
@wlb.member_callback
def config(self):
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
},
'Boundaries_GPU': {
'Border': []
},
'Boundaries_CPU': {
'Border': []
},
'Bubble': {
'bubbleMidPoint': self.bubbleMidPoint,
'bubbleRadius': self.bubbleRadius,
},
}
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
if not os.path.isfile(self.csv_file):
try:
with open(self.csv_file, 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=data)
writer.writeheader()
writer.writerow(data)
except IOError:
print("could not create csv file")
else:
try:
with open(self.csv_file, 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=data)
writer.writerow(data)
except IOError:
print("could not write to csv file")
def overlap_benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2), (32, 8, 1), (16, 16, 1)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
for cuda_block in cuda_blocks:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth,
cuda_block_size=cuda_block)
scenarios.add(scenario)
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(16, 16, 16),
cuda_block_size=(128, 4, 1))
scenarios.add(scenario)
def weak_scaling():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(32, 32, 32),
cuda_block_size=(128, 2, 1))
scenarios.add(scenario)
# overlap_benchmark()
# kernel_benchmark()
weak_scaling()
from pystencils import fields, TypedSymbol
from pystencils.simp import sympy_cse
from pystencils import AssignmentCollection
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.stencils import get_stencil
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
initializer_kernel_hydro_lb, interface_tracking_force, \
hydrodynamic_force, get_collision_assignments_hydro
from lbmpy.phasefield_allen_cahn.force_model import MultiphaseForceModel
import numpy as np
import sympy as sp
stencil_phase = get_stencil("D3Q15", "walberla")
stencil_hydro = get_stencil("D3Q27", "walberla")
q_phase = len(stencil_phase)
q_hydro = len(stencil_hydro)
maxwellian_moments = False
assert (len(stencil_phase[0]) == len(stencil_hydro[0]))
dimensions = len(stencil_hydro[0])
########################
# PARAMETER DEFINITION #
########################
density_liquid = 1.0
density_gas = 0.001
surface_tension = 0.0001
M = 0.02
# phase-field parameter
drho3 = (density_liquid - density_gas) / 3
# interface thickness
W = 5
# coefficient related to surface tension
beta = 12.0 * (surface_tension / W)
# coefficient related to surface tension
kappa = 1.5 * surface_tension * W
# relaxation rate allen cahn (h)
w_c = 1.0 / (0.5 + (3.0 * M))
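# For reference, the values chosen above (surface_tension = 1e-4, W = 5, M = 0.02) give
#   beta  = 12 * 1e-4 / 5         = 2.4e-4
#   kappa = 1.5 * 1e-4 * 5        = 7.5e-4
#   w_c   = 1 / (0.5 + 3 * 0.02)  ~= 1.786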
########################
# FIELDS #
########################
# velocity field
u = fields(f"vel_field({dimensions}): [{dimensions}D]", layout='fzyx')
# phase-field
C = fields(f"phase_field: [{dimensions}D]", layout='fzyx')
# phase-field distribution functions
h = fields(f"lb_phase_field({q_phase}): [{dimensions}D]", layout='fzyx')
h_tmp = fields(f"lb_phase_field_tmp({q_phase}): [{dimensions}D]", layout='fzyx')
# hydrodynamic distribution functions
g = fields(f"lb_velocity_field({q_hydro}): [{dimensions}D]", layout='fzyx')
g_tmp = fields(f"lb_velocity_field_tmp({q_hydro}): [{dimensions}D]", layout='fzyx')
########################################
# RELAXATION RATES AND EXTERNAL FORCES #
########################################
# calculate the relaxation rate for the hydro lb as well as the body force
density = density_gas + C.center * (density_liquid - density_gas)
# force acting on all phases of the model
body_force = np.array([0, 1e-6, 0])
relaxation_time = 0.03 + 0.5
relaxation_rate = 1.0 / relaxation_time
###############
# LBM METHODS #
###############
method_phase = create_lb_method(stencil=stencil_phase, method='srt', relaxation_rate=w_c, compressible=True)
method_hydro = create_lb_method(stencil=stencil_hydro, method="mrt", weighted=True,
relaxation_rates=[relaxation_rate, 1, 1, 1, 1, 1],
maxwellian_moments=maxwellian_moments)
# create the kernels for the initialization of the g and h field
h_updates = initializer_kernel_phase_field_lb(h, C, u, method_phase, W)
g_updates = initializer_kernel_hydro_lb(g, u, method_hydro)
force_h = [f / 3 for f in interface_tracking_force(C, stencil_phase, W)]
force_model_h = MultiphaseForceModel(force=force_h)
force_g = hydrodynamic_force(g, C, method_hydro, relaxation_time, density_liquid, density_gas, kappa, beta, body_force)
h_tmp_symbol_list = [h_tmp.center(i) for i, _ in enumerate(stencil_phase)]
sum_h = np.sum(h_tmp_symbol_list[:])
####################
# LBM UPDATE RULES #
####################
method_phase.set_force_model(force_model_h)
phase_field_LB_step = create_lb_update_rule(lb_method=method_phase,
velocity_input=u,
compressible=True,
optimization={"symbolic_field": h,
"symbolic_temporary_field": h_tmp},
kernel_type='stream_pull_collide')
phase_field_LB_step.set_main_assignments_from_dict({**phase_field_LB_step.main_assignments_dict, **{C.center: sum_h}})
phase_field_LB_step = AssignmentCollection(main_assignments=phase_field_LB_step.main_assignments,
subexpressions=phase_field_LB_step.subexpressions)
phase_field_LB_step = sympy_cse(phase_field_LB_step)
# ---------------------------------------------------------------------------------------------------------
hydro_LB_step = get_collision_assignments_hydro(lb_method=method_hydro,
density=density,
velocity_input=u,
force=force_g,
sub_iterations=1,
symbolic_fields={"symbolic_field": g,
"symbolic_temporary_field": g_tmp},
kernel_type='collide_stream_push')
# streaming of the hydrodynamic distribution
stream_hydro = create_lb_update_rule(stencil=stencil_hydro,
optimization={"symbolic_field": g,
"symbolic_temporary_field": g_tmp},
kernel_type='stream_pull_only')
###################
# GENERATE SWEEPS #
###################
cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': True}
vp = [('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')]
from pystencils import fields, Target
from pystencils.typing import TypedSymbol
from pystencils.simp import sympy_cse
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
from lbmpy import LBMConfig, LBStencil, Method, Stencil
from lbmpy.creationfunctions import LBMOptimisation, create_lb_method, create_lb_update_rule
from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, add_interface_tracking_force, \
add_hydrodynamic_force, hydrodynamic_force_assignments
from lbmpy.phasefield_allen_cahn.parameter_calculation import AllenCahnParameters
info_header = f"""
#include "stencil/D3Q{q_phase}.h"\nusing Stencil_phase_T = walberla::stencil::D3Q{q_phase};
#include "stencil/D3Q{q_hydro}.h"\nusing Stencil_hydro_T = walberla::stencil::D3Q{q_hydro};
"""
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field, generate_info_header
from lbmpy_walberla import generate_lb_pack_info
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
stencil_phase = LBStencil(Stencil.D3Q15)
stencil_hydro = LBStencil(Stencil.D3Q19)
assert (stencil_phase.D == stencil_hydro.D)
########################
# PARAMETER DEFINITION #
########################
# In the code generation script we only need the symbolic parameters
parameters = AllenCahnParameters(density_heavy=1.0, density_light=0.001,
dynamic_viscosity_heavy=0.03, dynamic_viscosity_light=0.03,
surface_tension=0.0001, mobility=0.02, interface_thickness=5,
gravitational_acceleration=1e-6)
########################
# FIELDS #
########################
# velocity field
u = fields(f"vel_field({stencil_hydro.D}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
# phase-field
C = fields(f"phase_field: {field_type}[{stencil_hydro.D}D]", layout='fzyx')
C_tmp = fields(f"phase_field_tmp: {field_type}[{stencil_hydro.D}D]", layout='fzyx')
# phase-field distribution functions
h = fields(f"lb_phase_field({stencil_phase.Q}): {field_type}[{stencil_phase.D}D]", layout='fzyx')
h_tmp = fields(f"lb_phase_field_tmp({stencil_phase.Q}): {field_type}[{stencil_phase.D}D]", layout='fzyx')
# hydrodynamic distribution functions
g = fields(f"lb_velocity_field({stencil_hydro.Q}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
g_tmp = fields(f"lb_velocity_field_tmp({stencil_hydro.Q}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
########################################
# RELAXATION RATES AND EXTERNAL FORCES #
########################################
# relaxation rate for interface tracking LB step
relaxation_rate_allen_cahn = 1.0 / (0.5 + (3.0 * parameters.symbolic_mobility))
# calculate the relaxation rate for hydrodynamic LB step
rho_L = parameters.symbolic_density_light
rho_H = parameters.symbolic_density_heavy
density = rho_L + C.center * (rho_H - rho_L)
# force acting on all phases of the model
body_force = [0, 0, 0]
body_force[1] = parameters.symbolic_gravitational_acceleration * density
omega = parameters.omega(C)
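# The density is interpolated linearly in the phase field: C = 0 yields the light phase
# (rho_L), C = 1 the heavy phase (rho_H). The body force acts in the y-direction and scales
# with this local density. parameters.omega(C) presumably returns the matching
# phase-dependent relaxation rate, interpolating between the two viscosities defined above.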
###############
# LBM METHODS #
###############
rates = [0.0]
rates += [relaxation_rate_allen_cahn] * stencil_phase.D
rates += [1.0] * (stencil_phase.Q - stencil_phase.D - 1)
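# The rates list has exactly stencil_phase.Q entries: a leading zero for the conserved zeroth
# moment, the Allen-Cahn relaxation rate for the D first-order moments that track the
# interface, and 1.0 for the remaining higher-order moments. For D3Q15 this amounts to
# [0.0] + [relaxation_rate_allen_cahn] * 3 + [1.0] * 11.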
lbm_config_phase = LBMConfig(stencil=stencil_phase, method=Method.MRT, compressible=True,
delta_equilibrium=False,
force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
weighted=True, relaxation_rates=rates,
output={'density': C_tmp})
method_phase = create_lb_method(lbm_config=lbm_config_phase)
lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
weighted=True, relaxation_rate=omega,
force=sp.symbols(f"F_:{stencil_hydro.D}"),
output={'velocity': u})
method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
# create the kernels for the initialization of the g and h field
h_updates = initializer_kernel_phase_field_lb(method_phase, C, u, h, parameters)
h_updates = h_updates.new_with_substitutions(parameters.parameter_map())
g_updates = initializer_kernel_hydro_lb(method_hydro, 1, u, g)
g_updates = g_updates.new_with_substitutions(parameters.parameter_map())
force_h = interface_tracking_force(C, stencil_phase, parameters)
hydro_force = hydrodynamic_force(C, method_hydro, parameters, body_force)
####################
# LBM UPDATE RULES #
####################
lbm_optimisation = LBMOptimisation(symbolic_field=h, symbolic_temporary_field=h_tmp)
phase_field_LB_step = create_lb_update_rule(lbm_config=lbm_config_phase,
lbm_optimisation=lbm_optimisation)
phase_field_LB_step = add_interface_tracking_force(phase_field_LB_step, force_h)
phase_field_LB_step = phase_field_LB_step.new_with_substitutions(parameters.parameter_map())
phase_field_LB_step = sympy_cse(phase_field_LB_step)
# ---------------------------------------------------------------------------------------------------------
force_Assignments = hydrodynamic_force_assignments(u, C, method_hydro, parameters, body_force)
lbm_optimisation = LBMOptimisation(symbolic_field=g, symbolic_temporary_field=g_tmp)
hydro_LB_step = create_lb_update_rule(lbm_config=lbm_config_hydro,
lbm_optimisation=lbm_optimisation)
hydro_LB_step = add_hydrodynamic_force(hydro_LB_step, force_Assignments, C, g, parameters)
hydro_LB_step = hydro_LB_step.new_with_substitutions(parameters.parameter_map())
hydro_LB_step = sympy_cse(hydro_LB_step)
###################
# GENERATE SWEEPS #
###################
# By default, non-temporal (NT) stores are deactivated because they do not work in all cases.
# They must be activated to reach full performance, for example on AVX512 CPUs.
cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': False}
vp = [('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1'),
('int32_t', 'cudaBlockSize2')]
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
TypedSymbol("cudaBlockSize2", np.int32))
sweep_params = {'block_size': sweep_block_size}
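# Leaving cudaBlockSize0/1/2 as typed symbols turns the GPU launch configuration into a
# runtime parameter of the generated sweeps; the application fills it from the 'gpuBlockSize'
# entry of the parameter file and passes it to the sweep constructors.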
stencil_typedefs = {'Stencil_phase_T': stencil_phase,
'Stencil_hydro_T': stencil_hydro,
'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
field_typedefs = {'PdfField_phase_T': h,
'PdfField_hydro_T': g,
'VelocityField_T': u,
'PhaseField_T': C}
additional_code = f"""
const char * StencilNamePhase = "{stencil_phase.name}";
const char * StencilNameHydro = "{stencil_hydro.name}";
"""
if not ctx.cuda:
if not ctx.optimize_for_localhost:
cpu_vec = {'instruction_set': None}
generate_sweep(ctx, 'initialize_phase_field_distributions', h_updates)
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates)
generate_sweep(ctx, 'initialize_phase_field_distributions', h_updates, target=Target.CPU)
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec)
cpu_vectorize_info=cpu_vec,
target=Target.CPU)
generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
field_swaps=[(g, g_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec)
cpu_vectorize_info=cpu_vec,
target=Target.CPU)
# communication
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
phase_field_LB_step.main_assignments, target='cpu')
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
hydro_LB_step.all_assignments, target='cpu', kind='pull')
generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
hydro_LB_step.all_assignments, target='cpu', kind='push')
generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
streaming_pattern='pull', target=Target.CPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='pull', target=Target.CPU)
ctx.write_file("GenDefines.h", info_header)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
if ctx.cuda:
generate_sweep(ctx, 'initialize_phase_field_distributions',
h_updates, target='gpu')
h_updates, target=Target.GPU)
generate_sweep(ctx, 'initialize_velocity_based_distributions',
g_updates, target='gpu')
g_updates, target=Target.GPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp)],
inner_outer_split=True,
target='gpu',
target=Target.GPU,
gpu_indexing_params=sweep_params,
varying_parameters=vp)
generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
field_swaps=[(g, g_tmp)],
inner_outer_split=True,
target='gpu',
target=Target.GPU,
gpu_indexing_params=sweep_params,
varying_parameters=vp)
# communication
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field_distributions',
phase_field_LB_step.main_assignments, target='gpu')
generate_pack_info_from_kernel(ctx, 'PackInfo_phase_field',
hydro_LB_step.all_assignments, target='gpu', kind='pull')
generate_pack_info_from_kernel(ctx, 'PackInfo_velocity_based_distributions',
hydro_LB_step.all_assignments, target='gpu', kind='push')
generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
streaming_pattern='pull', target=Target.GPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='pull', target=Target.GPU)
ctx.write_file("GenDefines.h", info_header)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
print("finished code generation successfully")
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'GenDefines', stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
additional_code=additional_code)
......@@ -16,15 +16,13 @@ class Scenario:
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.timeStepStrategy = 'phase_only'
self.overlappingWidth = (1, 1, 1)
self.timeStepStrategy = 'phase_only' # 'phase_only', 'hydro_only', 'kernel_only', 'normal'
self.cuda_block_size = (128, 1, 1)
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
@wlb.member_callback
......@@ -38,13 +36,11 @@ class Scenario:
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': 0,
'scenario': self.scenario,
'scenario': 1,
},
'Boundaries_GPU': {
'Border': []
waLBerla_link_files_to_builddir( "*.dat" )
waLBerla_add_executable( NAME PoiseuilleChannel DEPENDS blockforest boundary core field lbm postprocessing stencil timeloop vtk sqlite)
waLBerla_add_executable( NAME PoiseuilleChannel DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk walberla::sqlite )
##############
# Some tests #
##############
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates0.dat --trt --linear-exp LABELS longrun CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe0.dat --trt --linear-exp LABELS longrun CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates0.dat --trt --linear-exp LABELS longrun CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe0.dat --trt --linear-exp LABELS longrun CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
......@@ -608,7 +608,7 @@ protected:
const auto & dy = this->blockStorage_->dy( level );
WALBERLA_ASSERT_FLOAT_EQUAL( dy, this->blockStorage_->dz( level ) );
const real_t acceleration_L = pdf_->latticeModel().forceModel().force()[0]; // force = acceleration (in lattice units of the current level!)
const real_t acceleration_L = pdf_->latticeModel().forceModel().forceDensity()[0]; // force = acceleration (in lattice units of the current level!)
const real_t viscosity_L = pdf_->latticeModel().collisionModel().viscosity(); // in lattice units on the current level
const real_t radius_L = channelRadius / dy; // in lattice units on the current level
......@@ -890,7 +890,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
// remaining time logger
const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );
// logging right before the simulation starts
waLBerla_add_executable ( NAME PackPerformance
FILES PackPerformance.cpp
DEPENDS core )
DEPENDS walberla::core )
waLBerla_add_executable ( NAME ProbeVsExtraMessage
FILES ProbeVsExtraMessage.cpp
DEPENDS core postprocessing stencil sqlite)
DEPENDS walberla::core walberla::postprocessing walberla::stencil walberla::sqlite )
waLBerla_link_files_to_builddir( "*.dat" )
waLBerla_add_executable( NAME SchaeferTurek DEPENDS blockforest boundary core field lbm postprocessing stencil timeloop vtk sqlite )
waLBerla_execute_test( NO_MODULE_LABEL NAME SchaeferTurekTest COMMAND $<TARGET_FILE:SchaeferTurek> Test2D.dat PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
\ No newline at end of file
waLBerla_add_executable( NAME SchaeferTurek DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk walberla::sqlite )
waLBerla_execute_test( NO_MODULE_LABEL NAME SchaeferTurekTest COMMAND $<TARGET_FILE:SchaeferTurek> Test2D.dat PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS SchaeferTurek )
\ No newline at end of file
......@@ -1064,7 +1064,7 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
uint_t maxInflowLevel( uint_t(0) );
uint_t maxOutflowLevel( uint_t(0) );
// In addtion to keeping in- and outflow blocks at the same level, this callback also
// In addition to keeping in- and outflow blocks at the same level, this callback also
// prevents these blocks from coarsening.
for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
......@@ -1472,7 +1472,7 @@ void Evaluation< LatticeModel_T >::operator()()
{
WALBERLA_LOG_RESULT_ON_ROOT( "force acting on cylinder (in dimensionless lattice units of the coarsest grid - evaluated in time step "
<< forceEvaluationExecutionCount_ << "):\n " << force_ << oss.str() <<
"\ndrag and lift coefficients (including extremas of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
"\ndrag and lift coefficients (including extrema of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
"\n \"real\" area:"
"\n c_D: " << cDRealArea << " (min = " << coefficientExtremas_[0].first << ", max = " << coefficientExtremas_[0].second << ")" <<
"\n c_L: " << cLRealArea << " (min = " << coefficientExtremas_[1].first << ", max = " << coefficientExtremas_[1].second << ")" <<
......@@ -2050,9 +2050,7 @@ void Evaluation< LatticeModel_T >::evaluate( real_t & cDRealArea, real_t & cLRea
// force on obstacle
mpi::reduceInplace( force_[0], mpi::SUM );
mpi::reduceInplace( force_[1], mpi::SUM );
mpi::reduceInplace( force_[2], mpi::SUM );
mpi::reduceInplace( force_, mpi::SUM );
if( setup_.evaluateForceComponents )
{
......@@ -2365,7 +2363,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
// add velocity field + initialize velocity field writer (only used for simulations with an adaptive block structure)
using VelocityField_T = field::GhostLayerField<Vector3<real_t>, 1>;
BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::zyxf, FieldGhostLayers, true, None, Empty );
BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::fzyx, FieldGhostLayers, true, None, Empty );
using VelocityFieldWriter_T = lbm::VelocityFieldWriter<typename Types<LatticeModel_T>::PdfField_T, VelocityField_T>;
BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldId, velocityFieldId ), None, Empty );
......@@ -2569,14 +2567,14 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
blockforest::DynamicDiffusionBalance< blockforest::NoPhantomData >( maxIterations, flowIterations ) );
}
// add callback functions which are executed after all block data was unpakced after the dynamic load balancing
// add callback functions which are executed after all block data was unpacked after the dynamic load balancing
// for blocks that have *not* migrated: store current flag field state (required for lbm::PostProcessing)
blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::MarkerFieldGenerator< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
pdfFieldId, markerDataId, flagFieldFilter ) );
// (re)set boundaries = (re)initialize flag field for every block with respect to the new block structure (the size of neighbor blocks might have changed)
blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( blockforest::BlockForest::RefreshCallbackWrappper( boundarySetter ) );
// treat boundary-fluid cell convertions
// treat boundary-fluid cell conversions
blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::PostProcessing< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
pdfFieldId, markerDataId, flagFieldFilter ) );
// (re)set velocity field (velocity field data is not migrated!)
......@@ -2625,7 +2623,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
// remaining time logger
const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );
// logging right before the simulation starts
@@ -2920,10 +2918,10 @@ int main( int argc, char **argv )
"// //\n"
"// Schaefer Turek Benchmark //\n"
"// //\n"
"// Reference: Schaefer, M. and Turek, S. (1996) 'Benchmark computations of laminar flow around a cylinder (with support //\n"
"// Reference: Schaefer, M. and Turek, S. (1996) Benchmark computations of laminar flow around a cylinder (with support //\n"
"// by F. Durst, E. Krause and R. Rannacher), in E. Hirschel (Ed.): Flow Simulation with High-Performance //\n"
"// Computers II. DFG Priority Research Program Results 1993-1995, No. 52 in Notes Numer, Fluid Mech., //\n"
"// pp.547-566, Vieweg, Weisbaden. //\n"
"// Computers II. DFG Priority Research Program Results 1993-1995, No. 48 in Notes on Numerical Fluid //\n"
"// Mechanics, pp.547-566, Vieweg, Weisbaden. //\n"
"// //\n"
"//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////" );
waLBerla_link_files_to_builddir( *.prm )
if( WALBERLA_BUILD_WITH_CODEGEN )
# Turbulent Channel generation
walberla_generate_target_from_python( NAME TurbulentChannel_CodeGeneration
FILE TurbulentChannel.py
OUT_FILES CodegenIncludes.h
TurbulentChannel_Sweep.cpp TurbulentChannel_Sweep.h
TurbulentChannel_PackInfo.cpp TurbulentChannel_PackInfo.h
TurbulentChannel_Setter.cpp TurbulentChannel_Setter.h
TurbulentChannel_NoSlip.cpp TurbulentChannel_NoSlip.h
TurbulentChannel_FreeSlip_top.cpp TurbulentChannel_FreeSlip_top.h
TurbulentChannel_WFB_bottom.cpp TurbulentChannel_WFB_bottom.h
TurbulentChannel_WFB_top.cpp TurbulentChannel_WFB_top.h
TurbulentChannel_Welford.cpp TurbulentChannel_Welford.h
TurbulentChannel_Welford_TKE_SGS.cpp TurbulentChannel_Welford_TKE_SGS.h
TurbulentChannel_TKE_SGS_Writer.cpp TurbulentChannel_TKE_SGS_Writer.h
)
walberla_add_executable( NAME TurbulentChannel_Application
FILES TurbulentChannel.cpp
DEPENDS walberla::blockforest walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::timeloop
walberla::lbm walberla::stencil walberla::vtk TurbulentChannel_CodeGeneration )
endif()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file TurbulentChannel.cpp
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//
//======================================================================================================================
#include <memory>
#include <cmath>
#include <string>
#include <iostream>
#include <blockforest/all.h>
#include <core/all.h>
#include <domain_decomposition/all.h>
#include <field/all.h>
#include <field/vtk/VTKWriter.h>
#include <geometry/all.h>
#include <timeloop/all.h>
#include <lbm/all.h>
// Codegen Includes
#include "CodegenIncludes.h"
namespace walberla {
///////////////////////
/// Typedef Aliases ///
///////////////////////
using Stencil_T = codegen::Stencil_T;
// Communication Pack Info
using PackInfo_T = pystencils::TurbulentChannel_PackInfo;
// PDF field type
using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >;
// Field Types
using ScalarField_T = field::GhostLayerField< real_t, 1 >;
using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >;
using TensorField_T = field::GhostLayerField< real_t, Stencil_T::D*Stencil_T::D >;
using Setter_T = pystencils::TurbulentChannel_Setter;
using StreamCollideSweep_T = pystencils::TurbulentChannel_Sweep;
using WelfordSweep_T = pystencils::TurbulentChannel_Welford;
using TKEWelfordSweep_T = pystencils::TurbulentChannel_Welford_TKE_SGS;
using TkeSgsWriter_T = pystencils::TurbulentChannel_TKE_SGS_Writer;
// Boundary Handling
using flag_t = uint8_t;
using FlagField_T = FlagField< flag_t >;
using NoSlip_T = lbm::TurbulentChannel_NoSlip;
using FreeSlip_top_T = lbm::TurbulentChannel_FreeSlip_top;
using WFB_bottom_T = lbm::TurbulentChannel_WFB_bottom;
using WFB_top_T = lbm::TurbulentChannel_WFB_top;
/// DEAN CORRELATIONS
namespace dean_correlation {
real_t calculateFrictionReynoldsNumber(const real_t reynoldsBulk) {
return std::pow(0.073_r / 8_r, 1_r / 2_r) * std::pow(reynoldsBulk, 7_r / 8_r);
}
real_t calculateBulkReynoldsNumber(const real_t reynoldsFriction) {
return std::pow(8_r / 0.073_r, 4_r / 7_r) * std::pow(reynoldsFriction, 8_r / 7_r);
}
} // namespace dean_correlation
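// Rough sanity check for the correlations above (illustrative only, not used by the simulation):
// for a target Re_tau = 180, calculateBulkReynoldsNumber yields Re_b on the order of 5.5e3, and
// calculateFrictionReynoldsNumber applied to that value recovers Re_tau exactly, since the two
// power laws are algebraic inverses of each other. Note that Re_b is based on the full channel
// height 2*delta (see the viscosity definition in SimulationParameters below).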
/// VELOCITY FIELD INITIALISATION
/*
* Initialises the velocity field with a logarithmic profile and sinusoidal perturbations to trigger turbulence.
* This initialisation is provided by Henrik Asmuth.
*/
template<typename VelocityField_T>
void setVelocityFieldsAsmuth( const std::weak_ptr<StructuredBlockStorage>& forest,
const BlockDataID & velocityFieldId, const BlockDataID & meanVelocityFieldId,
const real_t frictionVelocity, const uint_t channel_half_width,
const real_t B, const real_t kappa, const real_t viscosity,
const uint_t wallAxis, const uint_t flowAxis ) {
auto blocks = forest.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
const auto domainSize = blocks->getDomain().max();
const auto delta = real_c(channel_half_width);
const auto remAxis = 3 - wallAxis - flowAxis;
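// The three axis indices sum to 0 + 1 + 2 = 3, so the remaining (spanwise) axis is 3 - wallAxis - flowAxis.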
for( auto block = blocks->begin(); block != blocks->end(); ++block ) {
auto * velocityField = block->template getData<VelocityField_T>(velocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(velocityField)
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
const auto ci = velocityField->xyzSizeWithGhostLayer();
for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
Cell globalCell(*cellIt);
blocks->transformBlockLocalToGlobalCell(globalCell, *block);
Vector3<real_t> cellCenter;
blocks->getCellCenter(cellCenter, globalCell);
const auto y = cellCenter[wallAxis];
const auto rel_x = cellCenter[flowAxis] / domainSize[flowAxis];
const auto rel_z = cellCenter[remAxis] / domainSize[remAxis];
const real_t pos = std::max(delta - std::abs(y - delta - 1_r), 0.05_r);
const auto rel_y = pos / delta;
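// Mean profile: the log law u+ = (1/kappa) * ln(y+) + B in lattice units, with y+ = frictionVelocity * pos / viscosity
// and pos being (roughly) the clamped distance to the nearest wall. The sinusoidal terms added to the
// spanwise and wall-normal components below perturb this profile so the flow trips into turbulence.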
auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
Vector3<real_t> vel;
vel[flowAxis] = initialVel;
vel[remAxis] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r * rel_x) *
std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r);
vel[wallAxis] = 8_r * frictionVelocity / kappa *
(std::sin(math::pi * 8_r * rel_z) * std::sin(math::pi * 8_r * rel_y) +
std::sin(math::pi * 8_r * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r);
for(uint_t d = 0; d < 3; ++d) {
velocityField->get(*cellIt, d) = vel[d];
meanVelocityField->get(*cellIt, d) = vel[d];
}
}
}
} // function setVelocityFieldsAsmuth
/// SIMULATION PARAMETERS
struct SimulationParameters {
SimulationParameters(const Config::BlockHandle & config)
{
channelHalfWidth = config.getParameter<uint_t>("channel_half_width");
fullChannel = config.getParameter<bool>("full_channel", false);
/// TARGET QUANTITIES
targetFrictionReynolds = config.getParameter<real_t>("target_friction_reynolds");
targetBulkVelocity = config.getParameter<real_t>("target_bulk_velocity", 0.05_r);
targetBulkReynolds = dean_correlation::calculateBulkReynoldsNumber(targetFrictionReynolds);
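// From the definitions Re_b = u_b * 2*delta / nu and Re_tau = u_tau * delta / nu, the (lattice)
// viscosity and the target friction velocity follow directly once the bulk velocity and the
// channel half-width delta are fixed: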
viscosity = 2_r * real_c(channelHalfWidth) * targetBulkVelocity / targetBulkReynolds;
targetFrictionVelocity = targetFrictionReynolds * viscosity / real_c(channelHalfWidth);
/// TIMESTEPS
timesteps = config.getParameter<uint_t>("timesteps", 0);
const uint_t turnoverPeriods = config.getParameter<uint_t>("turnover_periods", 0);
WALBERLA_ASSERT((timesteps != 0) != (turnoverPeriods != 0),
"Either timesteps OR turnover periods must be given.")
if(turnoverPeriods != 0) {
// turnover period defined by T = delta / u_tau
timesteps = turnoverPeriods * uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
}
/// DOMAIN DEFINITIONS
// obtained from codegen script -> adapt there
wallAxis = codegen::wallAxis;
flowAxis = codegen::flowAxis;
uint_t sizeFlowAxis = config.getParameter<uint_t>("size_flow_axis", 0);
uint_t sizeRemainingAxis = config.getParameter<uint_t>("size_remaining_axis", 0);
WALBERLA_ASSERT_NOT_IDENTICAL(wallAxis, flowAxis, "Wall and flow axis must be different.")
const auto sizeFactor = channelHalfWidth / uint_t(10);
if( !sizeFlowAxis) sizeFlowAxis = sizeFactor * 64;
if( !sizeRemainingAxis) sizeRemainingAxis = sizeFactor * 32;
domainSize[wallAxis] = fullChannel ? 2 * channelHalfWidth : channelHalfWidth;
domainSize[flowAxis] = sizeFlowAxis;
domainSize[3- wallAxis - flowAxis] = sizeRemainingAxis;
periodicity[wallAxis] = false;
boundaryCondition = config.getParameter<std::string>("wall_boundary_condition", "WFB");
/// OUTPUT
auto tsPerPeriod = uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
vtkFrequency = config.getParameter<uint_t>("vtk_frequency", 0);
vtkStart = config.getParameter<uint_t>("vtk_start", 0);
plotFrequency = config.getParameter<uint_t>("plot_frequency", 0);
plotStart = config.getParameter<uint_t>("plot_start", 0);
// vtk start
vtkStart = config.getParameter<uint_t>("vtk_start_timesteps", 0);
const uint_t vtkStartPeriods = config.getParameter<uint_t>("vtk_start_periods", 0);
if(vtkStart || vtkStartPeriods) {
WALBERLA_ASSERT((vtkStart != 0) != (vtkStartPeriods != 0),
"VTK start must be given in timesteps OR periods, not both.")
}
if(vtkStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
vtkStart = vtkStartPeriods * tsPerPeriod;
}
// plot start
plotStart = config.getParameter<uint_t>("plot_start_timesteps", 0);
const uint_t plotStartPeriods = config.getParameter<uint_t>("plot_start_periods", 0);
if(plotStart || plotStartPeriods) {
WALBERLA_ASSERT((plotStart != 0) != (plotStartPeriods != 0),
"Plotting start must be given in timesteps OR periods, not both.")
}
if(plotStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
plotStart = plotStartPeriods * tsPerPeriod;
}
// frequencies
if(plotFrequency) {
timesteps = uint_c(std::ceil(real_c(timesteps) / real_c(plotFrequency))) * plotFrequency;
}
// sampling start & interval
samplingStart = config.getParameter<uint_t>("sampling_start_timesteps", 0);
const uint_t samplingStartPeriods = config.getParameter<uint_t>("sampling_start_periods", 0);
if(samplingStart || samplingStartPeriods) {
WALBERLA_ASSERT((samplingStart != 0) != (samplingStartPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingStart = samplingStartPeriods * tsPerPeriod;
}
samplingInterval = config.getParameter<uint_t>("sampling_interval_timesteps", 0);
const uint_t samplingIntervalPeriods = config.getParameter<uint_t>("sampling_interval_periods", 0);
if(samplingInterval || samplingIntervalPeriods) {
WALBERLA_ASSERT((samplingInterval != 0) != (samplingIntervalPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingInterval = samplingIntervalPeriods * tsPerPeriod;
}
timesteps += 1;
}
uint_t channelHalfWidth{};
bool fullChannel{};
Vector3<uint_t> domainSize{};
Vector3<uint_t> periodicity{true};
real_t targetFrictionReynolds{};
real_t targetBulkReynolds{};
real_t targetFrictionVelocity{};
real_t targetBulkVelocity{};
real_t viscosity{};
const real_t density{1.0};
uint_t timesteps{};
std::string boundaryCondition{};
uint_t wallAxis{};
uint_t flowAxis{};
/// output
uint_t vtkFrequency{};
uint_t vtkStart{};
uint_t plotFrequency{};
uint_t plotStart{};
uint_t samplingStart{};
uint_t samplingInterval{};
};
namespace boundaries {
void createBoundaryConfig(const SimulationParameters & parameters, Config::Block & boundaryBlock) {
auto & bottomWall = boundaryBlock.createBlock("Border");
bottomWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, true)]);
bottomWall.addParameter("walldistance", "-1");
if(parameters.boundaryCondition == "NoSlip") {
bottomWall.addParameter("flag", "NoSlip");
} else if(parameters.boundaryCondition == "WFB") {
bottomWall.addParameter("flag", "WFB_bottom");
}
auto & topWall = boundaryBlock.createBlock("Border");
topWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, false)]);
topWall.addParameter("walldistance", "-1");
if(parameters.fullChannel) {
if (parameters.boundaryCondition == "NoSlip") {
topWall.addParameter("flag", "NoSlip");
} else if (parameters.boundaryCondition == "WFB") {
topWall.addParameter("flag", "WFB_top");
}
} else {
topWall.addParameter("flag", "FreeSlip");
}
}
} // namespace boundaries
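// For reference, the Config::Block assembled above corresponds to a parameter-file snippet of the
// form sketched below (illustrative only; the actual directions depend on codegen::wallAxis, the
// flags on the chosen wall boundary condition and on full_channel):
//
//   Border { direction <lower wall>; walldistance -1; flag WFB_bottom; }
//   Border { direction <upper wall>; walldistance -1; flag FreeSlip;   }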
/// BULK VELOCITY CALCULATION
template< typename VelocityField_T >
class ForceCalculator {
public:
ForceCalculator(const std::weak_ptr<StructuredBlockStorage> & blocks, const BlockDataID meanVelocityId,
const SimulationParameters & parameter)
: blocks_(blocks), meanVelocityId_(meanVelocityId), channelHalfWidth_(real_c(parameter.channelHalfWidth)),
targetBulkVelocity_(parameter.targetBulkVelocity), targetFrictionVelocity_(parameter.targetFrictionVelocity)
{
const auto & domainSize = parameter.domainSize;
Cell maxCell;
maxCell[parameter.wallAxis] = int_c(parameter.channelHalfWidth) - 1;
maxCell[flowDirection_] = int_c(domainSize[flowDirection_]) - 1;
const auto remainingIdx = 3 - parameter.wallAxis - flowDirection_;
maxCell[remainingIdx] = int_c(domainSize[remainingIdx]) - 1;
ci_ = CellInterval(Cell{}, maxCell);
numCells_ = real_c(parameter.channelHalfWidth * domainSize[flowDirection_] * domainSize[remainingIdx]);
}
real_t bulkVelocity() const { return bulkVelocity_; }
void setBulkVelocity(const real_t bulkVelocity) { bulkVelocity_ = bulkVelocity; }
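// Averages the streamwise component of the Welford mean-velocity field over the cell interval ci_
// set up in the constructor (the first channelHalfWidth cells in the wall-normal direction),
// reduces the partial sums across all MPI ranks and normalises by the total number of sampled cells.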
void calculateBulkVelocity() {
// reset bulk velocity
bulkVelocity_ = 0_r;
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for( auto block = blocks->begin(); block != blocks->end(); ++block) {
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
auto fieldSize = meanVelocityField->xyzSize();
CellInterval localCi;
blocks->transformGlobalToBlockLocalCellInterval(localCi, *block, ci_);
fieldSize.intersect(localCi);
auto * slicedField = meanVelocityField->getSlicedField(fieldSize);
WALBERLA_CHECK_NOT_NULLPTR(slicedField)
for(auto fieldIt = slicedField->beginXYZ(); fieldIt != slicedField->end(); ++fieldIt) {
const auto localMean = fieldIt[flowDirection_];
bulkVelocity_ += localMean;
}
}
mpi::allReduceInplace< real_t >(bulkVelocity_, mpi::SUM);
bulkVelocity_ /= numCells_;
}
real_t calculateDrivingForce() const {
// forcing term as in Malaspinas (2014) "Wall model for large-eddy simulation based on the lattice Boltzmann method"
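// The first term reproduces the wall shear stress of the target flow (tau_w / rho = u_tau^2, acting over
// the half-width delta); the second acts as a proportional controller that nudges the measured bulk
// velocity towards its target value.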
const auto force = targetFrictionVelocity_ * targetFrictionVelocity_ / channelHalfWidth_
+ (targetBulkVelocity_ - bulkVelocity_) * targetBulkVelocity_ / channelHalfWidth_;
return force;
}
private:
const std::weak_ptr<StructuredBlockStorage> blocks_{};
const BlockDataID meanVelocityId_{};
const uint_t flowDirection_{};
const real_t channelHalfWidth_{};
const real_t targetBulkVelocity_{};
const real_t targetFrictionVelocity_{};
CellInterval ci_{};
real_t numCells_{};
real_t bulkVelocity_{};
};
template< typename Welford_T >
class TurbulentChannelPlotter {
public:
TurbulentChannelPlotter(SimulationParameters const * const parameters, Timeloop * const timeloop,
ForceCalculator<VectorField_T> const * const forceCalculator,
const std::weak_ptr<StructuredBlockStorage> & blocks,
const BlockDataID velocityFieldId, const BlockDataID meanVelocityFieldId,
const BlockDataID meanTkeSGSFieldId, const BlockDataID sosFieldId, Welford_T * velocityWelford,
const bool separateFile = false)
: parameters_(parameters), forceCalculator_(forceCalculator), timeloop_(timeloop), blocks_(blocks),
velocityWelford_(velocityWelford), velocityFieldId_(velocityFieldId), meanVelocityFieldId_(meanVelocityFieldId),
meanTkeSGSFieldId_(meanTkeSGSFieldId), sosFieldId_(sosFieldId), plotFrequency_(parameters->plotFrequency), plotStart_(parameters->plotStart),
separateFiles_(separateFile)
{
if(!plotFrequency_)
return;
// prepare output folder
const filesystem::path path(baseFolder_);
std::string fileSuffix = parameters->boundaryCondition + "_";
if(parameters->fullChannel)
fileSuffix += "full_D";
else
fileSuffix += "half_D";
fileSuffix += std::to_string(parameters->channelHalfWidth) + "_Re" +
std::to_string(int(parameters->targetFrictionReynolds)) ;
velocityProfilesFilePath_ = path / ("velocity_profiles_" + fileSuffix);
forcingDataFilePath_ = path / ("forcing_data_" + fileSuffix + "_t" +
std::to_string(parameters->timesteps-1) + ".txt");
WALBERLA_ROOT_SECTION() {
// create directory if not existent; empty if existent
if( !filesystem::exists(path) ) {
filesystem::create_directories(path);
} else {
for (const auto& entry : filesystem::directory_iterator(path))
std::filesystem::remove_all(entry.path());
}
}
// write force header
std::ofstream os (forcingDataFilePath_, std::ios::out | std::ios::trunc);
if(os.is_open()) {
os << "# timestep\t bulk_velocity\t driving_force\n";
os.close();
} else {
WALBERLA_ABORT("Could not open forcing data file.")
}
}
void operator()() {
const auto ts = timeloop_->getCurrentTimeStep();
if(ts < plotStart_)
return;
if(!plotFrequency_ || (ts % plotFrequency_))
return;
const auto channelHalfWidth = real_c(parameters_->channelHalfWidth);
const auto bulkVelocity = forceCalculator_->bulkVelocity();
/// write force data
WALBERLA_ROOT_SECTION() {
std::ofstream forceOS(forcingDataFilePath_, std::ios::out | std::ios::app);
if (forceOS.is_open())
{
forceOS << ts << "\t" << bulkVelocity << "\t" << forceCalculator_->calculateDrivingForce() << "\n";
forceOS.close();
}
}
/// write velocity data
// gather velocity data
std::vector<real_t> instantaneousVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> meanVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeSGSVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeResolvedVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> reynoldsStressVector(parameters_->channelHalfWidth * TensorField_T::F_SIZE, 0_r);
const auto idxFlow = int_c(parameters_->domainSize[parameters_->flowAxis] / uint_t(2));
const auto idxRem = int_c(parameters_->domainSize[3 - parameters_->flowAxis - parameters_->wallAxis] / uint_t(2));
Cell point;
point[parameters_->flowAxis] = idxFlow;
point[3 - parameters_->flowAxis - parameters_->wallAxis] = idxRem;
const auto flowAxis = int_c(parameters_->flowAxis);
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for(auto block = blocks->begin(); block != blocks->end(); ++block) {
const auto * const velocity = block->template getData<VectorField_T>(velocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(velocity)
const auto * const meanVelocity = block->template getData<VectorField_T>(meanVelocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocity)
const auto * const tkeSGS = block->template getData<ScalarField_T>(meanTkeSGSFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(tkeSGS)
const auto * const sos = block->template getData<TensorField_T>(sosFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(sos)
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
point[parameters_->wallAxis] = int_c(idx);
Cell localCell;
blocks->transformGlobalToBlockLocalCell(localCell, *block, point);
if(velocity->xyzSize().contains(localCell)){
instantaneousVelocityVector[idx] = velocity->get(localCell, flowAxis);
meanVelocityVector[idx] = meanVelocity->get(localCell, flowAxis);
tkeSGSVector[idx] = tkeSGS->get(localCell);
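// The Welford sweep accumulates sums of squared velocity fluctuations; dividing by the sample counter
// turns them into the (co)variance estimates reported as Reynolds stresses, and half the trace of that
// tensor gives the resolved turbulent kinetic energy below.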
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
reynoldsStressVector[idx*TensorField_T::F_SIZE+i] = sos->get(localCell,i) / velocityWelford_->getCounter();
}
tkeResolvedVector[idx] = real_c(0.5) * (
reynoldsStressVector[idx*TensorField_T::F_SIZE+0] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+4] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+8]
);
}
}
}
// MPI exchange information
mpi::reduceInplace(instantaneousVelocityVector, mpi::SUM);
mpi::reduceInplace(meanVelocityVector, mpi::SUM);
mpi::reduceInplace(tkeSGSVector, mpi::SUM);
mpi::reduceInplace(tkeResolvedVector, mpi::SUM);
mpi::reduceInplace(reynoldsStressVector, mpi::SUM);
WALBERLA_ROOT_SECTION()
{
std::ofstream velocityOS;
filesystem::path path = velocityProfilesFilePath_;
if (separateFiles_) {
path.concat("_t" + std::to_string(timeloop_->getCurrentTimeStep()) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
} else {
path.concat("_t" + std::to_string(parameters_->timesteps-1) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
}
if (velocityOS.is_open()) {
if (!separateFiles_) velocityOS << "# t = " << ts << "\n";
velocityOS << "# y/delta\t y+\t u+\t u_mean\t u_instantaneous\t TKE_SGS\t TKE_resolved\t uu_rms\t uv_rms\t uw_rms\t vu_rms\t vv_rms\t vw_rms\t wu_rms\t wv_rms\t ww_rms\n";
const auto & viscosity = parameters_->viscosity;
const auto bulkReynolds = 2_r * channelHalfWidth * bulkVelocity / viscosity;
const auto frictionReynolds = dean_correlation::calculateFrictionReynoldsNumber(bulkReynolds);
const auto frictionVelocity = frictionReynolds * viscosity / channelHalfWidth;
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
// relative position
velocityOS << (real_c(idx)+0.5_r) / channelHalfWidth << "\t";
// y+
velocityOS << (real_c(idx)+0.5_r) * frictionVelocity / viscosity << "\t";
// u+
velocityOS << meanVelocityVector[idx] / frictionVelocity << "\t";
// mean velocity
velocityOS << meanVelocityVector[idx] << "\t";
// instantaneous velocity
velocityOS << instantaneousVelocityVector[idx] << "\t";
// subgrid-scale TKE
velocityOS << tkeSGSVector[idx] << "\t";
// resolved TKE
velocityOS << tkeResolvedVector[idx] << "\t";
// Reynolds stresses
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
velocityOS << reynoldsStressVector[idx*TensorField_T::F_SIZE+i] << "\t";
}
velocityOS << "\n";
}
velocityOS.close();
} else{
WALBERLA_ABORT("Could not open velocity plot file " << path.generic_string())
}
}
}
private:
SimulationParameters const * const parameters_{};
ForceCalculator<VectorField_T> const * const forceCalculator_{};
Timeloop * const timeloop_{};
const std::weak_ptr<StructuredBlockStorage> blocks_;
Welford_T * const velocityWelford_{};
const BlockDataID velocityFieldId_{};
const BlockDataID meanVelocityFieldId_{};
const BlockDataID meanTkeSGSFieldId_{};
const BlockDataID sosFieldId_{};
const uint_t plotFrequency_{};
const uint_t plotStart_{};
const bool separateFiles_{false};
const std::string baseFolder_{"output"};
filesystem::path velocityProfilesFilePath_;
filesystem::path forcingDataFilePath_;
};
/////////////////////
/// Main Function ///
/////////////////////
int main(int argc, char** argv) {
Environment walberlaEnv(argc, argv);
if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!") }
///////////////////////////////////////////////////////
/// Block Storage Creation and Simulation Parameter ///
///////////////////////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating block forest...")
const auto channelParameter = walberlaEnv.config()->getOneBlock("TurbulentChannel");
SimulationParameters simulationParameters(channelParameter);
// domain creation
std::shared_ptr<StructuredBlockForest> blocks;
{
Vector3< uint_t > numBlocks;
Vector3< uint_t > cellsPerBlock;
blockforest::calculateCellDistribution(simulationParameters.domainSize,
uint_c(mpi::MPIManager::instance()->numProcesses()),
numBlocks, cellsPerBlock);
const auto & periodicity = simulationParameters.periodicity;
auto & domainSize = simulationParameters.domainSize;
const Vector3<uint_t> newDomainSize(numBlocks[0] * cellsPerBlock[0], numBlocks[1] * cellsPerBlock[1], numBlocks[2] * cellsPerBlock[2]);
if(domainSize != newDomainSize) {
domainSize = newDomainSize;
WALBERLA_LOG_WARNING_ON_ROOT("\nWARNING: Domain size has changed due to the chosen domain decomposition.\n")
}
SetupBlockForest sforest;
sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
sforest.init( AABB(0_r, 0_r, 0_r, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2])),
numBlocks[0], numBlocks[1], numBlocks[2], periodicity[0], periodicity[1], periodicity[2] );
// calculate process distribution
const memory_t memoryLimit = numeric_cast< memory_t >( sforest.getNumberOfBlocks() );
const blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
true, false, std::bind( blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2,
cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] ) );
sforest.calculateProcessDistribution_Default( uint_c( MPIManager::instance()->numProcesses() ), memoryLimit,
"hilbert", 10, false, metisConfig );
if( !MPIManager::instance()->rankValid() )
MPIManager::instance()->useWorldComm();
// create StructuredBlockForest (encapsulates a newly created BlockForest)
WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << sforest)
sforest.writeVTKOutput("domain_decomposition");
auto bf = std::make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false );
blocks = std::make_shared< StructuredBlockForest >( bf, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] );
blocks->createCellBoundingBoxes();
}
////////////////////////////////////
/// PDF Field and Velocity Setup ///
////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating fields...")
// Common Fields
const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), codegen::layout);
const BlockDataID meanVelocityFieldId = field::addToStorage< VectorField_T >(blocks, "mean velocity", real_c(0.0), codegen::layout);
const BlockDataID sosFieldId = field::addToStorage< TensorField_T >(blocks, "sum of squares", real_c(0.0), codegen::layout);
const BlockDataID tkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID meanTkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "mean_tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID omegaFieldId = field::addToStorage< ScalarField_T >(blocks, "omega_out", real_c(0.0), codegen::layout);
const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
// CPU Field for PDFs
const BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), codegen::layout);
///////////////////////////////////////////
/// Force and bulk velocity calculation ///
///////////////////////////////////////////
ForceCalculator<VectorField_T> forceCalculator(blocks, velocityFieldId, simulationParameters);
//////////////
/// Setter ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Setting up fields...")
// Velocity field setup
setVelocityFieldsAsmuth<VectorField_T>(
blocks, velocityFieldId, meanVelocityFieldId,
simulationParameters.targetFrictionVelocity, simulationParameters.channelHalfWidth,
5.5_r, 0.41_r, simulationParameters.viscosity,
simulationParameters.wallAxis, simulationParameters.flowAxis );
forceCalculator.setBulkVelocity(simulationParameters.targetBulkVelocity);
const auto initialForce = forceCalculator.calculateDrivingForce();
// pdfs setup
Setter_T pdfSetter(pdfFieldId, velocityFieldId, initialForce, simulationParameters.density);
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
pdfSetter(blockIt.get());
/////////////
/// Sweep ///
/////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating sweeps...")
const auto omega = lbm::collision_model::omegaFromViscosity(simulationParameters.viscosity);
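// For the usual lattice with c_s^2 = 1/3 this corresponds to omega = 1 / (3 * nu + 0.5),
// i.e. the relaxation rate matching the lattice viscosity computed above.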
StreamCollideSweep_T streamCollideSweep(omegaFieldId, pdfFieldId, velocityFieldId, initialForce, omega);
WelfordSweep_T welfordSweep(meanVelocityFieldId, sosFieldId, velocityFieldId, 0_r);
TKEWelfordSweep_T welfordTKESweep(meanTkeSgsFieldId, tkeSgsFieldId, 0_r);
TkeSgsWriter_T tkeSgsWriter(omegaFieldId, pdfFieldId, tkeSgsFieldId, initialForce, omega);
/////////////////////////
/// Boundary Handling ///
/////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating boundary handling...")
const FlagUID fluidFlagUID("Fluid");
Config::Block boundaryBlock;
boundaries::createBoundaryConfig(simulationParameters, boundaryBlock);
std::unique_ptr<WFB_bottom_T> wfb_bottom_ptr = std::make_unique<WFB_bottom_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
std::unique_ptr<WFB_top_T > wfb_top_ptr = std::make_unique<WFB_top_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
NoSlip_T noSlip(blocks, pdfFieldId);
FreeSlip_top_T freeSlip_top(blocks, pdfFieldId);
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, Config::BlockHandle(&boundaryBlock));
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
freeSlip_top.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("FreeSlip"), fluidFlagUID);
wfb_bottom_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_bottom"), fluidFlagUID);
wfb_top_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_top"), fluidFlagUID);
//////////////
/// Output ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating field output...")
// vtk output
auto vtkWriter = vtk::createVTKOutput_BlockData(
blocks, "field_writer", simulationParameters.vtkFrequency, 0, false, "vtk_out", "simulation_step",
false, false, true, false
);
vtkWriter->setInitialWriteCallsToSkip(simulationParameters.vtkStart);
// velocity field writer
auto velocityWriter = std::make_shared<field::VTKWriter<VectorField_T>>(velocityFieldId, "instantaneous velocity");
vtkWriter->addCellDataWriter(velocityWriter);
auto meanVelocityFieldWriter = std::make_shared<field::VTKWriter<VectorField_T>>(meanVelocityFieldId, "mean velocity");
vtkWriter->addCellDataWriter(meanVelocityFieldWriter);
// one-off VTK output of the flag field
{
auto flagOutput = vtk::createVTKOutput_BlockData(
blocks, "flag_writer", 1, 1, false, "vtk_out", "simulation_step",
false, true, true, false
);
auto flagWriter = std::make_shared<field::VTKWriter<FlagField_T>>(flagFieldId, "flag field");
flagOutput->addCellDataWriter(flagWriter);
flagOutput->write();
}
/////////////////
/// Time Loop ///
/////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating timeloop...")
SweepTimeloop timeloop(blocks->getBlockStorage(), simulationParameters.timesteps);
// Communication
blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
auto setNewForce = [&](const real_t newForce) {
streamCollideSweep.setF_x(newForce);
tkeSgsWriter.setF_x(newForce);
};
// plotting
const bool outputSeparateFiles = channelParameter.getParameter<bool>("separate_files", false);
const TurbulentChannelPlotter<WelfordSweep_T > plotter(&simulationParameters, &timeloop, &forceCalculator, blocks,
velocityFieldId, meanVelocityFieldId,
meanTkeSgsFieldId, sosFieldId, &welfordSweep,
outputSeparateFiles);
// NOTE: sweeps whose state is altered during the run must be wrapped in lambdas; otherwise the
// timeloop stores a copy and, e.g., the force update and the Welford counter would stay at 0.
auto welfordLambda = [&welfordSweep, &welfordTKESweep](IBlock * block) {
welfordSweep(block);
welfordTKESweep(block);
};
auto wfbLambda = [&wfb_bottom_ptr, &wfb_top_ptr](IBlock * block) {
wfb_bottom_ptr->operator()(block);
wfb_top_ptr->operator()(block);
};
auto streamCollideLambda = [&streamCollideSweep](IBlock * block) {
streamCollideSweep(block);
};
// Timeloop
timeloop.add() << BeforeFunction(communication, "communication")
<< BeforeFunction([&](){forceCalculator.calculateBulkVelocity();}, "bulk velocity calculation")
<< BeforeFunction([&](){
const auto newForce = forceCalculator.calculateDrivingForce();
setNewForce(newForce);
}, "new force setter")
<< Sweep([](IBlock *){}, "new force setter");
timeloop.add() << Sweep(freeSlip_top, "freeSlip");
timeloop.add() << Sweep(noSlip, "noSlip");
timeloop.add() << Sweep(wfbLambda, "wall function bounce");
timeloop.add() << Sweep(streamCollideLambda, "stream and collide");
timeloop.add() << BeforeFunction([&](){
const uint_t velCtr = uint_c(welfordSweep.getCounter());
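// Restart the running averages whenever a new sampling window begins: at samplingStart, and
// afterwards whenever the Welford sample counter completes another samplingInterval. Resetting
// the counters together with the sum-of-squares and SGS-TKE fields discards the previous window.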
if((timeloop.getCurrentTimeStep() == simulationParameters.samplingStart) ||
(timeloop.getCurrentTimeStep() > simulationParameters.samplingStart && simulationParameters.samplingInterval && (velCtr % simulationParameters.samplingInterval == 0))) {
welfordSweep.setCounter(real_c(0.0));
welfordTKESweep.setCounter(real_c(0.0));
for(auto & block : *blocks) {
auto * sopField = block.template getData<TensorField_T >(sosFieldId);
sopField->setWithGhostLayer(0.0);
auto * tkeField = block.template getData<ScalarField_T>(tkeSgsFieldId);
tkeField->setWithGhostLayer(0.0);
}
}
welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
welfordTKESweep.setCounter(welfordTKESweep.getCounter() + real_c(1.0));
}, "welford sweep")
<< Sweep(welfordLambda, "welford sweep");
timeloop.add() << Sweep(tkeSgsWriter, "TKE_SGS writer");
timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkWriter), "VTK field output");
timeloop.addFuncAfterTimeStep(plotter, "Turbulent quantity plotting");
// LBM stability check
timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >(
walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ),
"LBM stability check" );
// Time logger
timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 5_r),
"remaining time logger");
WALBERLA_LOG_INFO_ON_ROOT("Running timeloop with " << timeloop.getNrOfTimeSteps() - 1 << " timesteps...")
WcTimingPool timing;
WcTimer timer;
timer.start();
timeloop.run(timing);
timer.end();
double time = timer.max();
walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
const auto timeloopTiming = timing.getReduced();
WALBERLA_LOG_INFO_ON_ROOT("Timeloop timing:\n" << *timeloopTiming)
const walberla::lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID);
performance.logResultOnRoot(simulationParameters.timesteps, time);
return EXIT_SUCCESS;
}
} // namespace walberla
int main(int argc, char** argv) { return walberla::main(argc, argv); }