7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57 · 7a6d3b57
--- a/apps/benchmarks/PhaseFieldAllenCahn/InitializerFunctions.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/InitializerFunctions.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file InitializerFunctions.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Constants.h"
+
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "field/vtk/VTKWriter.h"
+
+namespace walberla
+{
+using PhaseField_T = GhostLayerField< real_t, 1 >;
+
+void initPhaseField_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
+                           const real_t R                         = 10,
+                           const Vector3< real_t > bubbleMidPoint = Vector3< real_t >(0.0, 0.0, 0.0),
+                           const real_t W                         = 5)
+{
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      // clang-format off
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(phaseField, Cell globalCell;
+         blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+         real_t Ri = std::sqrt((real_c(globalCell[0]) - bubbleMidPoint[0]) * (real_c(globalCell[0]) - bubbleMidPoint[0]) +
+                               (real_c(globalCell[1]) - bubbleMidPoint[1]) * (real_c(globalCell[1]) - bubbleMidPoint[1]) +
+                               (real_c(globalCell[2]) - bubbleMidPoint[2]) * (real_c(globalCell[2]) - bubbleMidPoint[2]));
+         phaseField->get(x, y, z) = real_t(0.5) + real_t(0.5) * std::tanh(real_t(2.0) * (Ri - R) / W);
+      )
+      // clang-format on
+   }
+}
+
+void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID,
+                        const real_t W = 5)
+{
+   const real_t X              = real_c(blocks->getDomainCellBB().xMax());
+   const real_t halfY          = real_c(blocks->getDomainCellBB().yMax()) / real_t(2.0);
+   const real_t perturbation   = real_t(0.05);
+
+   for (auto& block : *blocks)
+   {
+      auto phaseField = block.getData< PhaseField_T >(phaseFieldID);
+      // clang-format off
+      WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(phaseField, Cell globalCell;
+         blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+         real_t tmp = perturbation * X * (std::cos((real_t(2.0) * math::pi * real_c(globalCell[0])) / X)
+                                        + std::cos((real_t(2.0) * math::pi * real_c(globalCell[2])) / X));
+         phaseField->get(x, y, z) = real_t(0.5) + real_t(0.5) * std::tanh(((real_t(globalCell[1]) - halfY) - tmp) / (W / real_t(2.0)));
+      )
+      // clang-format on
+   }
+}
+} // namespace walberla
--- a/apps/benchmarks/PhaseFieldAllenCahn/InitializerFunctions.h
+++ b/apps/benchmarks/PhaseFieldAllenCahn/InitializerFunctions.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file InitializerFunctions.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "core/math/Constants.h"
+
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "field/vtk/VTKWriter.h"
+
+#pragma once
+
+namespace walberla
+{
+void initPhaseField_bubble(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t R,
+                           Vector3< real_t > bubbleMidPoint, real_t W = 5);
+
+void initPhaseField_RTI(const shared_ptr< StructuredBlockStorage >& blocks, BlockDataID phaseFieldID, real_t W = 5);
+
+} // namespace walberla
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+import os
+import waLBerla as wlb
+import pandas as pd
+
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
+
+import sys
+from math import prod
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+
+    cells = prod(b + 2 * gls for b in block_size)
+    # 3 values for the velocity and two for the phase field and the temporary phase field
+    values_per_cell = 2 * q_phase + 2 * q_hydro + 3 + 2
+    needed_memory = values_per_cell * cells * size_per_value
+    return needed_memory < total_mem
+
+
+class Scenario:
+    def __init__(self, time_step_strategy,
+                 cuda_block_size,
+                 cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
+        # output frequencies
+        self.vtkWriteFrequency = 0
+
+        # simulation parameters
+        self.timesteps = 101
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+        self.periodic = (1, 1, 1)
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])
+
+        self.timeStepStrategy = time_step_strategy
+        self.cuda_block_size = cuda_block_size
+        self.warmupSteps = 10
+
+        self.cudaEnabledMpi = cuda_enabled_mpi
+
+        # bubble parameters
+        self.bubbleRadius = min(self.size) // 4
+        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
+
+        self.scenario = 1  # 1 rising bubble, 2 RTI
+        self.config_dict = self.config()
+
+        self.csv_file = "benchmark.csv"
+
+    @wlb.member_callback
+    def config(self):
+        return {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'timesteps': self.timesteps,
+                'vtkWriteFrequency': self.vtkWriteFrequency,
+                'remainingTimeLoggerFrequency': -1,
+                'timeStepStrategy': self.timeStepStrategy,
+                'gpuBlockSize': self.cuda_block_size,
+                'warmupSteps': self.warmupSteps,
+                'scenario': self.scenario,
+                'cudaEnabledMpi': self.cudaEnabledMpi
+            },
+            'Boundaries_GPU': {
+                'Border': []
+            },
+            'Boundaries_CPU': {
+                'Border': []
+            },
+            'Bubble': {
+                'bubbleMidPoint': self.bubbleMidPoint,
+                'bubbleRadius': self.bubbleRadius,
+            },
+        }
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
+        sequenceValuesToScalars(data)
+
+        df = pd.DataFrame.from_records([data])
+        if not os.path.isfile(self.csv_file):
+            df.to_csv(self.csv_file, index=False)
+        else:
+            df.to_csv(self.csv_file, index=False, mode='a', header=False)
+
+
+def benchmark():
+    scenarios = wlb.ScenarioManager()
+
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+
+    block_size = (320, 320, 320)
+    cuda_enabled_mpi = True
+
+    if not domain_block_size_ok(block_size, gpu_mem):
+        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+    else:
+        scenarios.add(Scenario(time_step_strategy='normal',
+                               cuda_block_size=(128, 1, 1),
+                               cells_per_block=block_size,
+                               cuda_enabled_mpi=cuda_enabled_mpi))
+
+
+benchmark()
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_cpu.py
+import os
+import waLBerla as wlb
+import pandas as pd
+
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from waLBerla.tools.config import block_decomposition
+
+import sys
+
+
+class Scenario:
+    def __init__(self, time_step_strategy, cells_per_block=(256, 256, 256),
+                 cuda_enabled_mpi=False):
+        # output frequencies
+        self.vtkWriteFrequency = 0
+
+        # simulation parameters
+        self.timesteps = 101
+        self.cells_per_block = cells_per_block
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+        self.periodic = (1, 1, 1)
+        self.size = (self.cells_per_block[0] * self.blocks[0],
+                     self.cells_per_block[1] * self.blocks[1],
+                     self.cells_per_block[2] * self.blocks[2])
+
+        self.timeStepStrategy = time_step_strategy
+        self.warmupSteps = 10
+
+        self.cudaEnabledMpi = cuda_enabled_mpi
+
+        # bubble parameters
+        self.bubbleRadius = min(self.size) // 4
+        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
+
+        self.config_dict = self.config()
+
+        self.csv_file = "benchmark.csv"
+
+    @wlb.member_callback
+    def config(self):
+        return {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'timesteps': self.timesteps,
+                'vtkWriteFrequency': self.vtkWriteFrequency,
+                'remainingTimeLoggerFrequency': -1,
+                'timeStepStrategy': self.timeStepStrategy,
+                'warmupSteps': self.warmupSteps,
+                'scenario': 1,
+                'cudaEnabledMpi': self.cudaEnabledMpi
+            },
+            'Boundaries_GPU': {
+                'Border': []
+            },
+            'Boundaries_CPU': {
+                'Border': []
+            },
+            'Bubble': {
+                'bubbleMidPoint': self.bubbleMidPoint,
+                'bubbleRadius': self.bubbleRadius,
+            },
+        }
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        df = pd.DataFrame.from_records([data])
+        if not os.path.isfile(self.csv_file):
+            df.to_csv(self.csv_file, index=False)
+        else:
+            df.to_csv(self.csv_file, index=False, mode='a', header=False)
+
+
+def benchmark():
+    scenarios = wlb.ScenarioManager()
+    block_size = (64, 64, 64)
+
+    scenarios.add(Scenario(time_step_strategy='normal', cells_per_block=block_size))
+
+
+def kernel_benchmark():
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
+
+    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
+        for block_size in block_sizes:
+            scenario = Scenario(time_step_strategy=time_step_strategy,
+                                cells_per_block=block_size)
+            scenarios.add(scenario)
+
+
+# benchmark()
+kernel_benchmark()
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file benchmark_multiphase.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Constants.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include "InitializerFunctions.h"
+
+//////////////////////////////
+// INCLUDE GENERATED FILES //
+////////////////////////////
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+#   include "gpu/AddGPUFieldToStorage.h"
+#   include "gpu/DeviceSelectMPI.h"
+#   include "gpu/ParallelStreams.h"
+#   include "gpu/communication/MemcpyPackInfo.h"
+#   include "gpu/communication/UniformGPUScheme.h"
+#else
+#   include <blockforest/communication/UniformBufferedScheme.h>
+#endif
+
+#include "GenDefines.h"
+
+using namespace walberla;
+
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+typedef gpu::GPUField< real_t > GPUField;
+#endif
+
+int main(int argc, char** argv)
+{
+   const mpi::Environment env(argc, argv);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+   gpu::selectDeviceBasedOnMpiRank();
+#endif
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      WALBERLA_MPI_WORLD_BARRIER()
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+      shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
+
+      // Reading parameters
+      auto parameters                    = config->getOneBlock("Parameters");
+      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
+      const uint_t timesteps             = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
+      const uint_t warmupSteps  = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      // CPU fields
+      const BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
+      BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
+      // GPU fields
+      const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+         blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
+      const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+         blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
+      const BlockDataID vel_field_gpu =
+         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
+      BlockDataID phase_field_gpu =
+         gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
+      BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
+#else
+      BlockDataID lb_phase_field =
+         field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
+      BlockDataID lb_velocity_field =
+         field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
+      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
+      BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
+      BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
+#endif
+
+      if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field")
+         if (scenario == 1)
+         {
+            auto bubbleParameters = config->getOneBlock("Bubble");
+            const Vector3< real_t > bubbleMidPoint =
+               bubbleParameters.getParameter< Vector3< real_t > >("bubbleMidPoint");
+            const real_t bubbleRadius = bubbleParameters.getParameter< real_t >("bubbleRadius", real_c(20.0));
+            initPhaseField_bubble(blocks, phase_field, bubbleRadius, bubbleMidPoint);
+         }
+         else if (scenario == 2)
+         {
+            initPhaseField_RTI(blocks, phase_field);
+         }
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+         gpu::fieldCpy< GPUField, PhaseField_T >(blocks, phase_field_gpu, phase_field);
+#endif
+         WALBERLA_LOG_INFO_ON_ROOT("initialization of the phase field done")
+      }
+
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      Vector3< int32_t > gpuBlockSize =
+         parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
+      pystencils::initialize_phase_field_distributions init_h(lb_phase_field_gpu, phase_field_gpu, vel_field_gpu);
+      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
+
+      pystencils::phase_field_LB_step phase_field_LB_step(
+         lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
+      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
+                                              gpuBlockSize[1], gpuBlockSize[2]);
+#else
+      pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
+      pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
+      pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
+      pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
+#endif
+
+// add communication
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
+      const int streamLowPriority  = 0;
+      const int streamHighPriority = 0;
+      auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+      auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
+
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
+      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
+
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
+      auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
+
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
+
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
+           auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
+
+#else
+
+      auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
+      auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
+      auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
+
+      auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+      auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
+      auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
+
+      UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
+      UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
+      UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
+
+      auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
+      auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
+
+      auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
+      auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
+      auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
+
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
+
+      auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
+        {
+           auto phaseField    = b->getData< PhaseField_T >(phase_field);
+           auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
+           phaseField->swapDataPointers(phaseFieldTMP);
+        });
+#endif
+
+      BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+      // Boundaries
+      const FlagUID fluidFlagUID("Fluid");
+      auto boundariesConfig = config->getBlock("Boundaries_GPU");
+      if (boundariesConfig)
+      {
+         geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
+      }
+
+      // initialize the two lattice Boltzmann fields
+      if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions")
+         for (auto& block : *blocks)
+         {
+            init_h(&block);
+            init_g(&block);
+         }
+         WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+         WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+         WALBERLA_MPI_BARRIER()
+         WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
+      }
+
+      SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
+
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
+
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
+
+#else
+      timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
+                     << Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
+                     << AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
+
+      timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
+                     << Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
+      timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
+                     << AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
+
+      timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
+#endif
+
+      uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      if (vtkWriteFrequency > 1)
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                         "simulation_step", false, true, true, false, 0);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+         vtkOutput->addBeforeFunction(
+            [&]() { gpu::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); });
+#endif
+         auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
+         vtkOutput->addCellDataWriter(phaseWriter);
+
+         timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+      }
+
+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
+      fluidCells();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
+      WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
+
+      for (uint_t i = 0; i < warmupSteps; ++i)
+         timeloop.singleStep();
+
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
+
+      timeloop.setCurrentTimeStepToZero();
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WcTimingPool timeloopTiming;
+      WcTimer simTimer;
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+#endif
+      simTimer.start();
+      timeloop.run(timeloopTiming);
+#if defined(WALBERLA_BUILD_WITH_CUDA)
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+#endif
+      WALBERLA_MPI_BARRIER()
+      simTimer.end();
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+      double time = simTimer.max();
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+      WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
+      WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
+      WALBERLA_ROOT_SECTION()
+      {
+         python_coupling::PythonCallback pythonCallbackResults("results_callback");
+         if (pythonCallbackResults.isCallable())
+         {
+            pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
+            pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
+            pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
+            #if defined(WALBERLA_BUILD_WITH_CUDA)
+               pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
+            #endif
+            // Call Python function to report results
+            pythonCallbackResults();
+         }
+      }
+   }
+
+   return EXIT_SUCCESS;
+}
--- a/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/multiphase_codegen.py
+import numpy as np
+import sympy as sp
+
+from pystencils import fields, Target
+from pystencils.typing import TypedSymbol
+from pystencils.simp import sympy_cse
+
+from lbmpy import LBMConfig, LBStencil, Method, Stencil
+from lbmpy.creationfunctions import LBMOptimisation, create_lb_method, create_lb_update_rule
+from lbmpy.phasefield_allen_cahn.kernel_equations import initializer_kernel_phase_field_lb, \
+    initializer_kernel_hydro_lb, interface_tracking_force, hydrodynamic_force, add_interface_tracking_force, \
+    add_hydrodynamic_force, hydrodynamic_force_assignments
+from lbmpy.phasefield_allen_cahn.parameter_calculation import AllenCahnParameters
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_for_field, generate_info_header
+from lbmpy_walberla import generate_lb_pack_info
+
+with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
+
+    stencil_phase = LBStencil(Stencil.D3Q15)
+    stencil_hydro = LBStencil(Stencil.D3Q19)
+    assert (stencil_phase.D == stencil_hydro.D)
+
+    ########################
+    # PARAMETER DEFINITION #
+    ########################
+    # In the codegneration skript we only need the symbolic parameters
+    parameters = AllenCahnParameters(density_heavy=1.0, density_light=0.001,
+                                     dynamic_viscosity_heavy=0.03, dynamic_viscosity_light=0.03,
+                                     surface_tension=0.0001, mobility=0.02, interface_thickness=5,
+                                     gravitational_acceleration=1e-6)
+
+    ########################
+    # FIELDS #
+    ########################
+
+    # velocity field
+    u = fields(f"vel_field({stencil_hydro.D}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
+    # phase-field
+    C = fields(f"phase_field: {field_type}[{stencil_hydro.D}D]", layout='fzyx')
+    C_tmp = fields(f"phase_field_tmp: {field_type}[{stencil_hydro.D}D]", layout='fzyx')
+
+    # phase-field distribution functions
+    h = fields(f"lb_phase_field({stencil_phase.Q}): {field_type}[{stencil_phase.D}D]", layout='fzyx')
+    h_tmp = fields(f"lb_phase_field_tmp({stencil_phase.Q}): {field_type}[{stencil_phase.D}D]", layout='fzyx')
+    # hydrodynamic distribution functions
+    g = fields(f"lb_velocity_field({stencil_hydro.Q}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
+    g_tmp = fields(f"lb_velocity_field_tmp({stencil_hydro.Q}): {field_type}[{stencil_hydro.D}D]", layout='fzyx')
+
+    ########################################
+    # RELAXATION RATES AND EXTERNAL FORCES #
+    ########################################
+
+    # relaxation rate for interface tracking LB step
+    relaxation_rate_allen_cahn = 1.0 / (0.5 + (3.0 * parameters.symbolic_mobility))
+    # calculate the relaxation rate for hydrodynamic LB step
+    rho_L = parameters.symbolic_density_light
+    rho_H = parameters.symbolic_density_heavy
+    density = rho_L + C.center * (rho_H - rho_L)
+    # force acting on all phases of the model
+    body_force = [0, 0, 0]
+    body_force[1] = parameters.symbolic_gravitational_acceleration * density
+
+    omega = parameters.omega(C)
+
+    ###############
+    # LBM METHODS #
+    ###############
+
+    rates = [0.0]
+    rates += [relaxation_rate_allen_cahn] * stencil_phase.D
+    rates += [1.0] * (stencil_phase.Q - stencil_phase.D - 1)
+
+    lbm_config_phase = LBMConfig(stencil=stencil_phase, method=Method.MRT, compressible=True,
+                                 delta_equilibrium=False,
+                                 force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
+                                 weighted=True, relaxation_rates=rates,
+                                 output={'density': C_tmp})
+    method_phase = create_lb_method(lbm_config=lbm_config_phase)
+
+    lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
+                                 weighted=True, relaxation_rate=omega,
+                                 force=sp.symbols(f"F_:{stencil_hydro.D}"),
+                                 output={'velocity': u})
+    method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
+
+    # create the kernels for the initialization of the g and h field
+    h_updates = initializer_kernel_phase_field_lb(method_phase, C, u, h, parameters)
+    h_updates = h_updates.new_with_substitutions(parameters.parameter_map())
+
+    g_updates = initializer_kernel_hydro_lb(method_hydro, 1, u, g)
+    g_updates = g_updates.new_with_substitutions(parameters.parameter_map())
+
+    force_h = interface_tracking_force(C, stencil_phase, parameters)
+    hydro_force = hydrodynamic_force(C, method_hydro, parameters, body_force)
+
+    ####################
+    # LBM UPDATE RULES #
+    ####################
+
+    lbm_optimisation = LBMOptimisation(symbolic_field=h, symbolic_temporary_field=h_tmp)
+    phase_field_LB_step = create_lb_update_rule(lbm_config=lbm_config_phase,
+                                                lbm_optimisation=lbm_optimisation)
+
+    phase_field_LB_step = add_interface_tracking_force(phase_field_LB_step, force_h)
+    phase_field_LB_step = phase_field_LB_step.new_with_substitutions(parameters.parameter_map())
+
+    phase_field_LB_step = sympy_cse(phase_field_LB_step)
+    # ---------------------------------------------------------------------------------------------------------
+    force_Assignments = hydrodynamic_force_assignments(u, C, method_hydro, parameters, body_force)
+
+    lbm_optimisation = LBMOptimisation(symbolic_field=g, symbolic_temporary_field=g_tmp)
+    hydro_LB_step = create_lb_update_rule(lbm_config=lbm_config_hydro,
+                                          lbm_optimisation=lbm_optimisation)
+
+    hydro_LB_step = add_hydrodynamic_force(hydro_LB_step, force_Assignments, C, g, parameters)
+    hydro_LB_step = hydro_LB_step.new_with_substitutions(parameters.parameter_map())
+
+    hydro_LB_step = sympy_cse(hydro_LB_step)
+
+    ###################
+    # GENERATE SWEEPS #
+    ###################
+
+    # by default NT Stores are deactivated because they do not work in all cases
+    # must be activated to achieve full potential for example on AVX512 CPUs
+    cpu_vec = {'assume_inner_stride_one': True, 'nontemporal': False}
+
+    vp = [('int32_t', 'cudaBlockSize0'),
+          ('int32_t', 'cudaBlockSize1'),
+          ('int32_t', 'cudaBlockSize2')]
+
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        TypedSymbol("cudaBlockSize2", np.int32))
+    sweep_params = {'block_size': sweep_block_size}
+
+    stencil_typedefs = {'Stencil_phase_T': stencil_phase,
+                        'Stencil_hydro_T': stencil_hydro,
+                        'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
+    field_typedefs = {'PdfField_phase_T': h,
+                      'PdfField_hydro_T': g,
+                      'VelocityField_T': u,
+                      'PhaseField_T': C}
+
+    additional_code = f"""
+    const char * StencilNamePhase = "{stencil_phase.name}";
+    const char * StencilNameHydro = "{stencil_hydro.name}";
+    """
+
+    if not ctx.cuda:
+        if not ctx.optimize_for_localhost:
+            cpu_vec = {'instruction_set': None}
+
+        generate_sweep(ctx, 'initialize_phase_field_distributions', h_updates, target=Target.CPU)
+        generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
+
+        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
+                       field_swaps=[(h, h_tmp)],
+                       inner_outer_split=True,
+                       cpu_vectorize_info=cpu_vec,
+                       target=Target.CPU)
+
+        generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
+                       field_swaps=[(g, g_tmp)],
+                       inner_outer_split=True,
+                       cpu_vectorize_info=cpu_vec,
+                       target=Target.CPU)
+
+        # communication
+        generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                              streaming_pattern='pull', target=Target.CPU)
+
+        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                              streaming_pattern='pull', target=Target.CPU)
+
+        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
+
+    if ctx.cuda:
+        generate_sweep(ctx, 'initialize_phase_field_distributions',
+                       h_updates, target=Target.GPU)
+        generate_sweep(ctx, 'initialize_velocity_based_distributions',
+                       g_updates, target=Target.GPU)
+
+        generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
+                       field_swaps=[(h, h_tmp)],
+                       target=Target.GPU,
+                       gpu_indexing_params=sweep_params,
+                       varying_parameters=vp)
+
+        generate_sweep(ctx, 'hydro_LB_step', hydro_LB_step,
+                       field_swaps=[(g, g_tmp)],
+                       target=Target.GPU,
+                       gpu_indexing_params=sweep_params,
+                       varying_parameters=vp)
+        # communication
+        generate_lb_pack_info(ctx, 'PackInfo_phase_field_distributions', stencil_phase, h,
+                              streaming_pattern='pull', target=Target.GPU)
+
+        generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
+                              streaming_pattern='pull', target=Target.GPU)
+
+        generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
+
+        # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'GenDefines', stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
+                         additional_code=additional_code)
--- a/apps/benchmarks/PhaseFieldAllenCahn/profiling.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/profiling.py
+import waLBerla as wlb
+
+
+class Scenario:
+    def __init__(self):
+        # output frequencies
+        self.vtkWriteFrequency = 100
+
+        # simulation parameters
+        self.timesteps = 3
+        edge_size = 200
+        self.cells = (edge_size, edge_size, edge_size)
+        self.blocks = (1, 1, 1)
+        self.periodic = (1, 1, 1)
+        self.size = (self.cells[0] * self.blocks[0],
+                     self.cells[1] * self.blocks[1],
+                     self.cells[2] * self.blocks[2])
+
+        self.timeStepStrategy = 'phase_only'  # 'phase_only', 'hydro_only', 'kernel_only', 'normal'
+        self.cuda_block_size = (128, 1, 1)
+
+        # bubble parameters
+        self.bubbleRadius = min(self.size) // 4
+        self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
+
+        self.config_dict = self.config()
+
+    @wlb.member_callback
+    def config(self):
+        return {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'timesteps': self.timesteps,
+                'vtkWriteFrequency': self.vtkWriteFrequency,
+                'remainingTimeLoggerFrequency': -1,
+                'timeStepStrategy': self.timeStepStrategy,
+                'gpuBlockSize': self.cuda_block_size,
+                'warmupSteps': 0,
+                'scenario': 1,
+            },
+            'Boundaries_GPU': {
+                'Border': []
+            },
+            'Boundaries_CPU': {
+                'Border': []
+            },
+            'Bubble': {
+                'bubbleMidPoint': self.bubbleMidPoint,
+                'bubbleRadius': self.bubbleRadius,
+            },
+        }
+
+
+scenarios = wlb.ScenarioManager()
+scenarios.add(Scenario())
--- a/apps/benchmarks/PoiseuilleChannel/CMakeLists.txt
+++ b/apps/benchmarks/PoiseuilleChannel/CMakeLists.txt

 waLBerla_link_files_to_builddir( "*.dat" )                 
-                                  
-waLBerla_add_executable( NAME PoiseuilleChannel DEPENDS blockforest boundary core field lbm postprocessing stencil timeloop vtk sqlite)
+
+waLBerla_add_executable( NAME PoiseuilleChannel DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk walberla::sqlite )

 ##############
 # Some tests #
 ##############

-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckRelease           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat           --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckRelease COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckRelease           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat           --trt --linear-exp PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )

-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized )
-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckDebug           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat           --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlatesNoCheckDebug COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlatesNoCheck.dat --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized DEPENDS_ON_TARGETS PoiseuilleChannel )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipeNoCheckDebug           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipeNoCheck.dat           --trt --linear-exp PROCESSES 4 LABELS longrun CONFIGURATIONS Debug DebugOptimized DEPENDS_ON_TARGETS PoiseuilleChannel )

-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates0.dat --trt --linear-exp LABELS longrun                         CONFIGURATIONS Release RelWithDbgInfo )
-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe0           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe0.dat --trt --linear-exp           LABELS longrun                         CONFIGURATIONS Release RelWithDbgInfo )
-waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe2           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe2.dat --trt --linear-exp           LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates0 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates0.dat --trt --linear-exp LABELS longrun                         CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestParallelPlates2 COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestParallelPlates2.dat --trt --linear-exp LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe0           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe0.dat --trt --linear-exp           LABELS longrun                         CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
+waLBerla_execute_test( NO_MODULE_LABEL NAME PoiseuilleChannelTestPipe2           COMMAND $<TARGET_FILE:PoiseuilleChannel> ${CMAKE_CURRENT_SOURCE_DIR}/TestPipe2.dat --trt --linear-exp           LABELS longrun verylongrun PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS PoiseuilleChannel )
--- a/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp
+++ b/apps/benchmarks/PoiseuilleChannel/PoiseuilleChannel.cpp
@@ -116,15 +116,15 @@ using walberla::real_t;
 // TYPEDEFS //
 //////////////

-typedef lbm::D3Q19< lbm::collision_model::SRT, false, lbm::force_model::SimpleConstant > D3Q19_SRT_INCOMP;
-typedef lbm::D3Q19< lbm::collision_model::SRT, true,  lbm::force_model::SimpleConstant > D3Q19_SRT_COMP;
-typedef lbm::D3Q19< lbm::collision_model::TRT, false, lbm::force_model::SimpleConstant > D3Q19_TRT_INCOMP;
+using D3Q19_SRT_INCOMP = lbm::D3Q19<lbm::collision_model::SRT, false, lbm::force_model::SimpleConstant>;
+using D3Q19_SRT_COMP = lbm::D3Q19<lbm::collision_model::SRT, true, lbm::force_model::SimpleConstant>;
+using D3Q19_TRT_INCOMP = lbm::D3Q19<lbm::collision_model::TRT, false, lbm::force_model::SimpleConstant>;
 //typedef lbm::D3Q19< lbm::collision_model::TRT, true,  lbm::force_model::SimpleConstant > D3Q19_TRT_COMP;

-typedef lbm::D3Q27< lbm::collision_model::SRT, false, lbm::force_model::SimpleConstant > D3Q27_SRT_INCOMP;
-typedef lbm::D3Q27< lbm::collision_model::SRT, true,  lbm::force_model::SimpleConstant > D3Q27_SRT_COMP;
-typedef lbm::D3Q27< lbm::collision_model::TRT, false, lbm::force_model::SimpleConstant > D3Q27_TRT_INCOMP;
-typedef lbm::D3Q27< lbm::collision_model::TRT, true,  lbm::force_model::SimpleConstant > D3Q27_TRT_COMP;
+using D3Q27_SRT_INCOMP = lbm::D3Q27<lbm::collision_model::SRT, false, lbm::force_model::SimpleConstant>;
+using D3Q27_SRT_COMP = lbm::D3Q27<lbm::collision_model::SRT, true, lbm::force_model::SimpleConstant>;
+using D3Q27_TRT_INCOMP = lbm::D3Q27<lbm::collision_model::TRT, false, lbm::force_model::SimpleConstant>;
+using D3Q27_TRT_COMP = lbm::D3Q27<lbm::collision_model::TRT, true, lbm::force_model::SimpleConstant>;

 template< typename LatticeModel_T >
 struct Types
@@ -154,13 +154,13 @@ template< typename LatticeModel_T, class Enable = void >
 struct StencilString;

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q19 > > >
 {
   static const char * str() { return "D3Q19"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 > > >
 {
   static const char * str() { return "D3Q27"; }
 };
@@ -170,15 +170,15 @@ template< typename LatticeModel_T, class Enable = void >
 struct CollisionModelString;

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                          lbm::collision_model::SRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                          lbm::collision_model::SRT_tag > > >
 {
   static const char * str() { return "SRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                          lbm::collision_model::TRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                          lbm::collision_model::TRT_tag > > >
 {
   static const char * str() { return "TRT"; }
 };
@@ -241,10 +241,10 @@ private:
      if( setup_.circularProfile )
      {
         auto corners = block.getAABB().corners();
-         for( auto p = corners.begin(); p != corners.end(); ++p )
+         for(auto & corner : corners)
         {
-            const real_t dy = (*p)[1] - forest.getDomain().center()[1];
-            const real_t dz = (*p)[2] - forest.getDomain().center()[2];
+            const real_t dy = corner[1] - forest.getDomain().center()[1];
+            const real_t dz = corner[2] - forest.getDomain().center()[2];
            const real_t r = setup_.radius_L - bufferDistance_;
            if( (dy * dy + dz * dz) >= (r * r) )
               return false;
@@ -265,9 +265,7 @@ private:
      {
         const auto d = block.getAABB().sqSignedDistance( forest.getDomain().center() );
         const real_t r = setup_.radius_L + bufferDistance_;
-         if( d > (r * r) )
-            return true;
-         return false;
+         return d > (r * r);
      }

      return false;
@@ -283,10 +281,10 @@ private:

 static void workloadAndMemoryAssignment( SetupBlockForest& forest, const memory_t memoryPerBlock )
 {
-   for( auto block = forest.begin(); block != forest.end(); ++block )
+   for(auto & block : forest)
   {
-      block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-      block->setMemory( memoryPerBlock );
+      block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+      block.setMemory( memoryPerBlock );
   }
 }

@@ -305,7 +303,7 @@ static shared_ptr< SetupBlockForest > createSetupBlockForest( const blockforest:
                                                             ( setup.zCells + uint_t(2) * FieldGhostLayers ) ) * memoryPerCell;

   forest->addRefinementSelectionFunction( refinementSelectionFunctions );
-   forest->addWorkloadMemorySUIDAssignmentFunction( std::bind( workloadAndMemoryAssignment, std::placeholders::_1, memoryPerBlock ) );
+   forest->addWorkloadMemorySUIDAssignmentFunction( [memoryPerBlock](auto & bForest) { return workloadAndMemoryAssignment(bForest, memoryPerBlock); } );

   forest->init( AABB( real_c(0), real_c(0), real_c(0), real_c( setup.xBlocks * setup.xCells ),
                                                        real_c( setup.yBlocks * setup.yCells ),
@@ -455,10 +453,10 @@ class MyBoundaryHandling
 {
 public:

-   typedef lbm::NoSlip< LatticeModel_T, flag_t >      NoSlip_T;
-   typedef lbm::Curved< LatticeModel_T, FlagField_T > Curved_T;
+   using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t>;
+   using Curved_T = lbm::Curved<LatticeModel_T, FlagField_T>;

-   typedef BoundaryHandling< FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, Curved_T > BoundaryHandling_T;
+   using BoundaryHandling_T = BoundaryHandling<FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, Curved_T>;



@@ -610,7 +608,7 @@ protected:
      const auto & dy = this->blockStorage_->dy( level );
      WALBERLA_ASSERT_FLOAT_EQUAL( dy, this->blockStorage_->dz( level ) );

-      const real_t acceleration_L = pdf_->latticeModel().forceModel().force()[0]; // force = acceleration (in lattice units of the current level!)
+      const real_t acceleration_L = pdf_->latticeModel().forceModel().forceDensity()[0]; // force = acceleration (in lattice units of the current level!)
      const real_t viscosity_L    = pdf_->latticeModel().collisionModel().viscosity(); // in lattice units on the current level
      const real_t radius_L       = channelRadius / dy; // in lattice units on the current level
      
@@ -824,9 +822,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod

   if( vtkBeforeTimeStep )
   {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncBeforeTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                         output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncBeforeTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                        output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
   }

   // add 'refinement' LB time step to time loop
@@ -846,13 +844,15 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
   timeloop.addFuncBeforeTimeStep( makeSharedFunctor(ts), "LBM refinement time step" );
                                                 
   // evaluation 
-   
-   const auto exactSolutionFunction = setup.circularProfile ? std::bind( exactPipeVelocity, std::placeholders::_1, blocks, setup ) :
-                                                              std::bind( exactPlatesVelocity, std::placeholders::_1, blocks, setup );
+   using FlowRateVelocitySolution_T = std::function<Vector3<real_t> (const Vector3<real_t> &)>;
+   const FlowRateVelocitySolution_T exactPipeFunction = [&blocks, &setup](const Vector3<real_t> &p) { return exactPipeVelocity(p, blocks, setup); };
+   const FlowRateVelocitySolution_T exactPlateFunction = [&blocks, &setup](const Vector3<real_t> &p) { return exactPlatesVelocity(p, blocks, setup); };
+
+   const auto exactSolutionFunction = setup.circularProfile ? exactPipeFunction : exactPlateFunction;

   auto volumetricFlowRate = field::makeVolumetricFlowRateEvaluation< VelocityAdaptor_T, FlagField_T >( configBlock, blocks, velocityAdaptorId,
                                                                                                        flagFieldId, Fluid_Flag,
-                                                                                                        std::bind( exactFlowRate, setup.flowRate_L ),
+                                                                                                        [flowRate = setup.flowRate_L] { return exactFlowRate(flowRate); },
                                                                                                        exactSolutionFunction );
   volumetricFlowRate->setNormalizationFactor( real_t(1) / setup.maxVelocity_L );
   volumetricFlowRate->setDomainNormalization( Vector3<real_t>( real_t(1) ) );
@@ -885,14 +885,14 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod

   if( !vtkBeforeTimeStep )
   {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncAfterTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                        output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncAfterTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                       output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
   }

   // remaining time logger

-   const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
+   const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
   timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );

   // logging right before the simulation starts

--- a/apps/benchmarks/ProbeVsExtraMessage/CMakeLists.txt
+++ b/apps/benchmarks/ProbeVsExtraMessage/CMakeLists.txt
 waLBerla_add_executable ( NAME PackPerformance 
                          FILES PackPerformance.cpp
-                          DEPENDS core )
+      DEPENDS walberla::core )

 waLBerla_add_executable ( NAME ProbeVsExtraMessage 
                          FILES ProbeVsExtraMessage.cpp
-                          DEPENDS core postprocessing stencil sqlite)
+      DEPENDS walberla::core walberla::postprocessing walberla::stencil walberla::sqlite )
--- a/apps/benchmarks/ProbeVsExtraMessage/ProbeVsExtraMessage.cpp
+++ b/apps/benchmarks/ProbeVsExtraMessage/ProbeVsExtraMessage.cpp
@@ -63,7 +63,7 @@ private:
   shared_ptr<mpi::MPIManager> manager_;
   Vector3<uint_t> procs_;
   Vector3<bool>   periodicity_;
-   Vector3<int>    pos_;
+   std::array<int, 3>    pos_;
   std::array<int, 27> ranks_;
 };

@@ -73,7 +73,7 @@ MPIInfo::MPIInfo( const Vector3<uint_t>& procs, const Vector3<bool>& periodicity
   , periodicity_(periodicity)
 {
   manager_->createCartesianComm(procs[0], procs[1], procs[2], periodicity[0], periodicity[1], periodicity[2]);
-   manager_->cartesianCoord(pos_.data());
+   manager_->cartesianCoord(pos_);

   for (auto dirIt = stencil::D3Q27::beginNoCenter(); dirIt != stencil::D3Q27::end(); ++dirIt)
   {
@@ -137,9 +137,9 @@ void communicate( MPIInfo& mpiInfo,
      WALBERLA_MPI_BARRIER();
      tp["unpack"].start();
      auto& recvInfos = bs.getRecvInfos();
-      for (auto recvIt = recvInfos.begin(); recvIt != recvInfos.end(); ++recvIt)
+      for (auto & recvInfoItem : recvInfos)
      {
-         auto& rb = recvIt->second.buffer;
+         auto& rb = recvInfoItem.second.buffer;
         rb >> recvBuf;
         WALBERLA_ASSERT(rb.isEmpty());
      }
@@ -259,7 +259,7 @@ int main( int argc, char ** argv )
 }
 } // namespace walberla

-int main( int argc, char* argv[] )
+int main( int argc, char** argv )
 {
   return walberla::main( argc, argv );
-}
+}
\ No newline at end of file
--- a/apps/benchmarks/SchaeferTurek/CMakeLists.txt
+++ b/apps/benchmarks/SchaeferTurek/CMakeLists.txt

 waLBerla_link_files_to_builddir( "*.dat" )   
-                                  
-waLBerla_add_executable( NAME SchaeferTurek DEPENDS blockforest boundary core field lbm postprocessing stencil timeloop vtk sqlite )

-waLBerla_execute_test( NO_MODULE_LABEL NAME SchaeferTurekTest COMMAND $<TARGET_FILE:SchaeferTurek> Test2D.dat PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo )
\ No newline at end of file
+waLBerla_add_executable( NAME SchaeferTurek DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::field walberla::lbm walberla::postprocessing walberla::stencil walberla::timeloop walberla::vtk walberla::sqlite )
+
+waLBerla_execute_test( NO_MODULE_LABEL NAME SchaeferTurekTest COMMAND $<TARGET_FILE:SchaeferTurek> Test2D.dat PROCESSES 4 CONFIGURATIONS Release RelWithDbgInfo DEPENDS_ON_TARGETS SchaeferTurek )
\ No newline at end of file
--- a/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
+++ b/apps/benchmarks/SchaeferTurek/SchaeferTurek.cpp
@@ -133,26 +133,26 @@ using walberla::real_t;
 // TYPEDEFS //
 //////////////

-typedef lbm::D2Q9< lbm::collision_model::SRT,       false > D2Q9_SRT_INCOMP;
-typedef lbm::D2Q9< lbm::collision_model::SRT,       true  > D2Q9_SRT_COMP;
-typedef lbm::D2Q9< lbm::collision_model::TRT,       false > D2Q9_TRT_INCOMP;
-typedef lbm::D2Q9< lbm::collision_model::TRT,       true  > D2Q9_TRT_COMP;
-
-typedef lbm::D3Q15< lbm::collision_model::SRT,      false > D3Q15_SRT_INCOMP;
-typedef lbm::D3Q15< lbm::collision_model::SRT,      true  > D3Q15_SRT_COMP;
-typedef lbm::D3Q15< lbm::collision_model::TRT,      false > D3Q15_TRT_INCOMP;
-typedef lbm::D3Q15< lbm::collision_model::TRT,      true  > D3Q15_TRT_COMP;
-
-typedef lbm::D3Q19< lbm::collision_model::SRT,      false > D3Q19_SRT_INCOMP;
-typedef lbm::D3Q19< lbm::collision_model::SRT,      true  > D3Q19_SRT_COMP;
-typedef lbm::D3Q19< lbm::collision_model::TRT,      false > D3Q19_TRT_INCOMP;
-typedef lbm::D3Q19< lbm::collision_model::TRT,      true  > D3Q19_TRT_COMP;
-typedef lbm::D3Q19< lbm::collision_model::D3Q19MRT, false > D3Q19_MRT_INCOMP;
-
-typedef lbm::D3Q27< lbm::collision_model::SRT,      false > D3Q27_SRT_INCOMP;
-typedef lbm::D3Q27< lbm::collision_model::SRT,      true  > D3Q27_SRT_COMP;
-typedef lbm::D3Q27< lbm::collision_model::TRT,      false > D3Q27_TRT_INCOMP;
-typedef lbm::D3Q27< lbm::collision_model::TRT,      true  > D3Q27_TRT_COMP;
+using D2Q9_SRT_INCOMP = lbm::D2Q9<lbm::collision_model::SRT, false>;
+using D2Q9_SRT_COMP = lbm::D2Q9<lbm::collision_model::SRT, true>;
+using D2Q9_TRT_INCOMP = lbm::D2Q9<lbm::collision_model::TRT, false>;
+using D2Q9_TRT_COMP = lbm::D2Q9<lbm::collision_model::TRT, true>;
+
+using D3Q15_SRT_INCOMP = lbm::D3Q15<lbm::collision_model::SRT, false>;
+using D3Q15_SRT_COMP = lbm::D3Q15<lbm::collision_model::SRT, true>;
+using D3Q15_TRT_INCOMP = lbm::D3Q15<lbm::collision_model::TRT, false>;
+using D3Q15_TRT_COMP = lbm::D3Q15<lbm::collision_model::TRT, true>;
+
+using D3Q19_SRT_INCOMP = lbm::D3Q19<lbm::collision_model::SRT, false>;
+using D3Q19_SRT_COMP = lbm::D3Q19<lbm::collision_model::SRT, true>;
+using D3Q19_TRT_INCOMP = lbm::D3Q19<lbm::collision_model::TRT, false>;
+using D3Q19_TRT_COMP = lbm::D3Q19<lbm::collision_model::TRT, true>;
+using D3Q19_MRT_INCOMP = lbm::D3Q19<lbm::collision_model::D3Q19MRT, false>;
+
+using D3Q27_SRT_INCOMP = lbm::D3Q27<lbm::collision_model::SRT, false>;
+using D3Q27_SRT_COMP = lbm::D3Q27<lbm::collision_model::SRT, true>;
+using D3Q27_TRT_INCOMP = lbm::D3Q27<lbm::collision_model::TRT, false>;
+using D3Q27_TRT_COMP = lbm::D3Q27<lbm::collision_model::TRT, true>;

 template< typename LatticeModel_T >
 struct Types
@@ -204,25 +204,25 @@ template< typename LatticeModel_T, class Enable = void >
 struct StencilString;

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > > >
 {
   static const char * str() { return "D2Q9"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > > >
 {
   static const char * str() { return "D3Q15"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q19 > > >
 {
   static const char * str() { return "D3Q19"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 > > >
 {
   static const char * str() { return "D3Q27"; }
 };
@@ -232,22 +232,22 @@ template< typename LatticeModel_T, class Enable = void >
 struct CollisionModelString;

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                          lbm::collision_model::SRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                          lbm::collision_model::SRT_tag > > >
 {
   static const char * str() { return "SRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                          lbm::collision_model::TRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                          lbm::collision_model::TRT_tag > > >
 {
   static const char * str() { return "TRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                          lbm::collision_model::MRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                          lbm::collision_model::MRT_tag > > >
 {
   static const char * str() { return "MRT"; }
 };
@@ -355,7 +355,7 @@ bool Cylinder::contains( const AABB & aabb ) const
 {
   if( setup_.circularCrossSection )
   {
-      Vector3< real_t > p[8];
+      std::array< Vector3< real_t >, 8 > p;
      p[0].set( aabb.xMin(), aabb.yMin(), aabb.zMin() );
      p[1].set( aabb.xMax(), aabb.yMin(), aabb.zMin() );
      p[2].set( aabb.xMin(), aabb.yMax(), aabb.zMin() );
@@ -521,12 +521,12 @@ public:

   void operator()( SetupBlockForest & forest )
   {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
      {
-         const AABB & aabb = block->getAABB();
+         const AABB & aabb = block.getAABB();

-         if( block->getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
-            block->setMarker( true );
+         if( block.getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
+            block.setMarker( true );
      }
   }

@@ -633,10 +633,10 @@ static void workloadMemoryAndSUIDAssignment( SetupBlockForest & forest, const me
   }
   else
   {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
      {
-         block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-         block->setMemory( memoryPerBlock );
+         block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+         block.setMemory( memoryPerBlock );
      }
   }
 }
@@ -656,7 +656,8 @@ static shared_ptr< SetupBlockForest > createSetupBlockForest( const blockforest:
                                                             ( setup.xCells + uint_t(2) * FieldGhostLayers ) ) * memoryPerCell;

   forest->addRefinementSelectionFunction( refinementSelectionFunctions );
-   forest->addWorkloadMemorySUIDAssignmentFunction( std::bind( workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock, std::cref( setup ) ) );
+   forest->addWorkloadMemorySUIDAssignmentFunction([memoryPerBlock, setup_cref = std::cref(setup)](SetupBlockForest & sbf) {
+      return workloadMemoryAndSUIDAssignment(sbf, memoryPerBlock, setup_cref); } );

   forest->init( AABB( real_c(0), real_c(0), real_c(0),
                       setup.H * ( real_c(setup.xBlocks) * real_c(setup.xCells) ) / ( real_c(setup.yzBlocks) * real_c(setup.yzCells) ), setup.H, setup.H ),
@@ -776,16 +777,15 @@ private:
 template< typename LatticeModel_T >
 struct MyBoundaryTypes
 {
-   typedef lbm::NoSlip< LatticeModel_T, flag_t >                                  NoSlip_T;
-   typedef lbm::NoSlip< LatticeModel_T, flag_t >                                  Obstacle_T;
-   typedef lbm::Curved< LatticeModel_T, FlagField_T >                             Curved_T;
-   typedef lbm::DynamicUBB< LatticeModel_T, flag_t,
-                            SinusInflowVelocity<Is2D< LatticeModel_T >::value> >  DynamicUBB_T;
-   typedef lbm::Outlet< LatticeModel_T, FlagField_T, 2, 1 >                       Outlet21_T;
-   typedef lbm::Outlet< LatticeModel_T, FlagField_T, 4, 3 >                       Outlet43_T;
-   typedef lbm::SimplePressure< LatticeModel_T, flag_t >                          PressureOutlet_T;
-
-   typedef BoundaryHandling< FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, Obstacle_T, Curved_T, DynamicUBB_T, Outlet21_T, Outlet43_T, PressureOutlet_T > BoundaryHandling_T;
+   using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t>;
+   using Obstacle_T = lbm::NoSlip<LatticeModel_T, flag_t>;
+   using Curved_T = lbm::Curved<LatticeModel_T, FlagField_T>;
+   using DynamicUBB_T = lbm::DynamicUBB<LatticeModel_T, flag_t, SinusInflowVelocity<Is2D<LatticeModel_T>::value>>;
+   using Outlet21_T = lbm::Outlet<LatticeModel_T, FlagField_T, 2, 1>;
+   using Outlet43_T = lbm::Outlet<LatticeModel_T, FlagField_T, 4, 3>;
+   using PressureOutlet_T = lbm::SimplePressure<LatticeModel_T, flag_t>;
+
+   using BoundaryHandling_T = BoundaryHandling<FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, Obstacle_T, Curved_T, DynamicUBB_T, Outlet21_T, Outlet43_T, PressureOutlet_T>;
 };


@@ -1035,11 +1035,11 @@ Set<SUID> Pseudo2DBlockStateDetermination::operator()( const std::vector< std::p
   auto aabb = forest_.getAABBFromBlockId( destintation );
   
   Set<SUID> state;
-   for( auto it = source.begin(); it != source.end(); ++it )
+   for(const auto & it : source)
   {
-      for( auto suid = it->second.begin(); suid != it->second.end(); ++suid )
-         if( *suid != state_ )
-            state += *suid;
+      for(auto suid : it.second)
+         if( suid != state_ )
+            state += suid;
   }
   
   if( markEmptyBlocks_ )
@@ -1065,16 +1065,16 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
   uint_t maxInflowLevel( uint_t(0) );
   uint_t maxOutflowLevel( uint_t(0) );

-   // In addtion to keeping in- and outflow blocks at the same level, this callback also
+   // In addition to keeping in- and outflow blocks at the same level, this callback also
   // prevents these blocks from coarsening.

-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
   {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
      if( forest.atDomainXMinBorder(*block) )
      {
-         it->second = std::max( it->second, block->getLevel() );
-         maxInflowLevel = std::max( maxInflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxInflowLevel = std::max( maxInflowLevel, minTargetLevel.second );
      }
      if( setup.strictlyObeyL )
      {
@@ -1084,19 +1084,18 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
         p[0] = setup.L;
         if( aabb.contains(p) )
         {
-            it->second = std::max( it->second, block->getLevel() );
-            maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+            minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+            maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
         }
      }
      else if( forest.atDomainXMaxBorder(*block) )
      {
-         it->second = std::max( it->second, block->getLevel() );
-         maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
      }
   }
-   for( auto it = blocksAlreadyMarkedForRefinement.begin(); it != blocksAlreadyMarkedForRefinement.end(); ++it )
+   for(auto block : blocksAlreadyMarkedForRefinement)
   {
-      const Block * const block = *it;
      if( forest.atDomainXMinBorder(*block) )
      {
         maxInflowLevel = std::max( maxInflowLevel, block->getTargetLevel() );
@@ -1117,12 +1116,12 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
   mpi::allReduceInplace( maxInflowLevel, mpi::MAX );
   mpi::allReduceInplace( maxOutflowLevel, mpi::MAX );

-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
   {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
      if( forest.atDomainXMinBorder(*block) )
      {
-         it->second = maxInflowLevel;
+         minTargetLevel.second = maxInflowLevel;
      }
      if( setup.strictlyObeyL )
      {
@@ -1131,10 +1130,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
         auto p = aabb.center();
         p[0] = setup.L;
         if( aabb.contains(p) )
-            it->second = maxOutflowLevel;
+            minTargetLevel.second = maxOutflowLevel;
      }
      else if( forest.atDomainXMaxBorder(*block) )
-         it->second = maxOutflowLevel;
+         minTargetLevel.second = maxOutflowLevel;
   }
 }

@@ -1144,10 +1143,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
 void pseudo2DTargetLevelCorrection( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
                                    std::vector< const Block * > &, const blockforest::BlockForest & forest )
 {
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
   {
-      if( ! forest.atDomainZMinBorder( *(it->first ) ) )
-         it->second = uint_t(0);
+      if( ! forest.atDomainZMinBorder( *(minTargetLevel.first ) ) )
+         minTargetLevel.second = uint_t(0);
   }
 }

@@ -1178,12 +1177,12 @@ public:
   
   void operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData, const PhantomBlockForest & )
   {
-      for( auto it = blockData.begin(); it != blockData.end(); ++it )
+      for(auto & it : blockData)
      {
-         if( it->first->getState().contains( state_ ) )
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
+         if( it.first->getState().contains( state_ ) )
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
         else
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
      }
   }

@@ -1298,10 +1297,9 @@ public:
               const bool logToStream = true, const bool logToFile = true, const std::string& filename = std::string("SchaeferTurek.txt"),
               const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(),
               const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) :
-      initialized_( false ), blocks_( blocks ),
-      executionCounter_( uint_t(0) ), checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
-      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ), D_( uint_t(0) ), AD_( real_t(0) ), AL_( real_t(0) ), forceEvaluationExecutionCount_( uint_t(0) ),
-      strouhalRising_( false ), strouhalNumberRealD_( real_t(0) ), strouhalNumberDiscreteD_( real_t(0) ), strouhalEvaluationExecutionCount_( uint_t(0) ),
+      blocks_( blocks ),
+      checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
+      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ),
      logToStream_( logToStream ), logToFile_( logToFile ), filename_( filename ),
      requiredSelectors_( requiredSelectors ), incompatibleSelectors_( incompatibleSelectors )
   {
@@ -1318,9 +1316,9 @@ public:
                    "cD (real area) [5], cL (real area) [6], cD (discrete area) [7], cL (discrete area) [8], "
                    "pressure difference (in lattice units) [9], pressure difference (in Pa) [10], vortex velocity (in lattice units) [11], "
                    "Strouhal number (real D) [12], Strouhal number (discrete D) [13]" << std::endl;
-            if( setup_.evaluatePressure == false )
+            if( !setup_.evaluatePressure )
               file << "# ATTENTION: pressure was not evaluated, pressure difference is set to zero!" << std::endl;
-            if( setup_.evaluateStrouhal == false )
+            if( !setup_.evaluateStrouhal )
               file << "# ATTENTION: vortex velocities were not evaluated, Strouhal number is set to zero!" << std::endl;
            file.close();
         }
@@ -1346,13 +1344,13 @@ protected:



-   bool initialized_;
+   bool initialized_{ false };

   weak_ptr< StructuredBlockStorage > blocks_;
   std::map< IBlock *, std::vector< std::pair< Cell, stencil::Direction > > >  directions_;

-   uint_t executionCounter_;
-   uint_t checkFrequency_;
+   uint_t executionCounter_{ uint_c(0) };
+   uint_t checkFrequency_{ uint_c(0) };
   
   BlockDataID pdfFieldId_;
   BlockDataID flagFieldId_;
@@ -1362,23 +1360,23 @@ protected:
   
   Setup setup_;

-   uint_t D_;
-   real_t AD_;
-   real_t AL_;
+   uint_t D_{ uint_c(0) };
+   real_t AD_{ real_c(0) };
+   real_t AL_{ real_c(0) };
   
   Vector3< real_t > force_;
   std::vector< math::Sample > forceSample_;
-   uint_t forceEvaluationExecutionCount_;
+   uint_t forceEvaluationExecutionCount_{ uint_c(0) };
   
   std::vector< std::deque< real_t > > coefficients_;
   std::vector< std::pair< real_t, real_t > > coefficientExtremas_;

   std::vector< real_t > strouhalVelocities_;
   std::vector< uint_t > strouhalTimeStep_;
-   bool strouhalRising_;
-   real_t strouhalNumberRealD_;
-   real_t strouhalNumberDiscreteD_;
-   uint_t strouhalEvaluationExecutionCount_;
+   bool strouhalRising_{ false };
+   real_t strouhalNumberRealD_{ real_c(0) };
+   real_t strouhalNumberDiscreteD_{ real_c(0) };
+   uint_t strouhalEvaluationExecutionCount_{ uint_c(0) };

   bool logToStream_;
   bool logToFile_;
@@ -1473,7 +1471,7 @@ void Evaluation< LatticeModel_T >::operator()()
      {
         WALBERLA_LOG_RESULT_ON_ROOT( "force acting on cylinder (in dimensionless lattice units of the coarsest grid - evaluated in time step "
                                      << forceEvaluationExecutionCount_ << "):\n   " << force_ << oss.str() <<
-                                      "\ndrag and lift coefficients (including extremas of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
+                                      "\ndrag and lift coefficients (including extrema of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
                                      "\n   \"real\" area:"
                                      "\n      c_D: " << cDRealArea << " (min = " << coefficientExtremas_[0].first << ", max = " << coefficientExtremas_[0].second << ")" <<
                                      "\n      c_L: " << cLRealArea << " (min = " << coefficientExtremas_[1].first << ", max = " << coefficientExtremas_[1].second << ")" <<
@@ -1605,10 +1603,10 @@ void Evaluation< LatticeModel_T >::operator()( IBlock * block, const uint_t leve
      const PdfField_T * const pdfField = block->template getData< PdfField_T >( pdfFieldId_ );

      const auto & directions = directions_[ block ];
-      for( auto pair = directions.begin(); pair != directions.end(); ++pair )
+      for(const auto & pair : directions)
      {
-         const Cell cell( pair->first );
-         const stencil::Direction direction( pair->second );
+         const Cell cell( pair.first );
+         const stencil::Direction direction( pair.second );

         const real_t scaleFactor = real_t(1) / real_c( uint_t(1) << ( (Is2D< LatticeModel_T >::value ? uint_t(1) : uint_t(2)) * level ) );

@@ -1713,8 +1711,8 @@ void Evaluation< LatticeModel_T >::getResultsForSQLOnRoot( std::map< std::string
 {
   WALBERLA_ROOT_SECTION()
   {
-      for( auto result = sqlResults_.begin(); result != sqlResults_.end(); ++result )
-         realProperties[ result->first ] = result->second;
+      for(const auto & sqlResult : sqlResults_)
+         realProperties[ sqlResult.first ] = sqlResult.second;
         
      integerProperties[ "forceEvaluationTimeStep" ] = int_c( forceEvaluationExecutionCount_ );
      integerProperties[ "strouhalEvaluationTimeStep" ] = int_c( strouhalEvaluationExecutionCount_ );
@@ -2051,9 +2049,7 @@ void Evaluation< LatticeModel_T >::evaluate( real_t & cDRealArea, real_t & cLRea

   // force on obstacle

-   mpi::reduceInplace( force_[0], mpi::SUM );
-   mpi::reduceInplace( force_[1], mpi::SUM );
-   mpi::reduceInplace( force_[2], mpi::SUM );
+   mpi::reduceInplace( force_, mpi::SUM );

   if( setup_.evaluateForceComponents )
   {
@@ -2158,7 +2154,7 @@ public:

   SnapshotSimulator( const weak_ptr<blockforest::StructuredBlockForest> & blocks,
                      const uint_t failAt, const uint_t failRangeBegin, const uint_t failRangeEnd, const bool failRebalance ) :
-      blocks_( blocks ), executionCounter_( 0 ),
+      blocks_( blocks ),
      failAt_( failAt ), failRangeBegin_( failRangeBegin ), failRangeEnd_( failRangeEnd ), failRebalance_( failRebalance )
   {}

@@ -2219,7 +2215,7 @@ private:

   weak_ptr<blockforest::StructuredBlockForest> blocks_;

-   uint_t executionCounter_;
+   uint_t executionCounter_{ uint_c(0) };
   uint_t failAt_;

   uint_t failRangeBegin_;
@@ -2285,7 +2281,7 @@ struct AddRefinementTimeStep
         }
         else
         {
-            typedef lbm::SplitSweep< LatticeModel_T, FlagField_T > Sweep_T;
+            using Sweep_T = lbm::SplitSweep<LatticeModel_T, FlagField_T>;
            auto mySweep = make_shared< Sweep_T >( pdfFieldId, flagFieldId, Fluid_Flag );

            addRefinementTimeStep< LatticeModel_T, Sweep_T >( timeloop, blocks, pdfFieldId, boundaryHandlingId, timingPool, levelwiseTimingPool,
@@ -2305,11 +2301,11 @@ struct AddRefinementTimeStep
 };

 template< typename LatticeModel_T  >
-struct AddRefinementTimeStep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value
-                                                                       >::type >
+struct AddRefinementTimeStep< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag > ||
+                                                                       std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > ||
+                                                                       std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > ||
+                                                                       std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 >
+                                                                       > >
 {
   static void add( SweepTimeloop & timeloop, shared_ptr< blockforest::StructuredBlockForest > & blocks,
                    const BlockDataID & pdfFieldId, const BlockDataID & flagFieldId, const BlockDataID & boundaryHandlingId,
@@ -2365,10 +2361,10 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
   
   // add velocity field + initialize velocity field writer (only used for simulations with an adaptive block structure)

-   typedef field::GhostLayerField< Vector3<real_t>, 1 > VelocityField_T;
-   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::zyxf, FieldGhostLayers, true, None, Empty );
+   using VelocityField_T = field::GhostLayerField<Vector3<real_t>, 1>;
+   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::fzyx, FieldGhostLayers, true, None, Empty );

-   typedef lbm::VelocityFieldWriter< typename Types<LatticeModel_T>::PdfField_T, VelocityField_T > VelocityFieldWriter_T;
+   using VelocityFieldWriter_T = lbm::VelocityFieldWriter<typename Types<LatticeModel_T>::PdfField_T, VelocityField_T>;
   BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldId, velocityFieldId ), None, Empty );
   velocityFieldWriter();

@@ -2432,9 +2428,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod

   if( vtkBeforeTimeStep )
   {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncBeforeTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                         output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncBeforeTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                        output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
   }

   // evaluation
@@ -2493,8 +2489,8 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
         adaptiveRefinementLog = oss.str();
      }

-      minTargetLevelDeterminationFunctions.add( std::bind( keepInflowOutflowAtTheSameLevel, std::placeholders::_1, std::placeholders::_2, 
-                                                           std::placeholders::_3, std::cref(setup) ) );
+      minTargetLevelDeterminationFunctions.add([setup_cref = std::cref(setup)](auto & minTargetLevels, auto & blocksAlreadyMarkedForRefinement, auto & bf) {
+         return keepInflowOutflowAtTheSameLevel(minTargetLevels, blocksAlreadyMarkedForRefinement, bf, setup_cref); } );

      if( Is2D< LatticeModel_T >::value )
         minTargetLevelDeterminationFunctions.add( pseudo2DTargetLevelCorrection );
@@ -2570,14 +2566,14 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
                     blockforest::DynamicDiffusionBalance< blockforest::NoPhantomData >( maxIterations, flowIterations ) );
      }

-      // add callback functions which are executed after all block data was unpakced after the dynamic load balancing
+      // add callback functions which are executed after all block data was unpacked after the dynamic load balancing

      // for blocks that have *not* migrated: store current flag field state (required for lbm::PostProcessing)
      blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::MarkerFieldGenerator< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
               pdfFieldId, markerDataId, flagFieldFilter ) );
      // (re)set boundaries = (re)initialize flag field for every block with respect to the new block structure (the size of neighbor blocks might have changed)
      blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( blockforest::BlockForest::RefreshCallbackWrappper( boundarySetter ) );
-      // treat boundary-fluid cell convertions
+      // treat boundary-fluid cell conversions
      blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::PostProcessing< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
               pdfFieldId, markerDataId, flagFieldFilter ) );
      // (re)set velocity field (velocity field data is not migrated!)
@@ -2613,9 +2609,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod

   if( !vtkBeforeTimeStep )
   {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncAfterTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                        output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncAfterTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                       output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
   }

   // stability check (non-finite values in the PDF field?)
@@ -2626,7 +2622,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod

   // remaining time logger

-   const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
+   const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
   timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );

   // logging right before the simulation starts
@@ -2921,10 +2917,10 @@ int main( int argc, char **argv )
                              "//                                                                                                                      //\n"
                              "//                                               Schaefer Turek Benchmark                                               //\n"
                              "//                                                                                                                      //\n"
-                              "// Reference: Schaefer, M. and Turek, S. (1996) 'Benchmark computations of laminar flow around a cylinder (with support //\n"
+                              "// Reference: Schaefer, M. and Turek, S. (1996) Benchmark computations of laminar flow around a cylinder (with support  //\n"
                              "//            by F. Durst, E. Krause and R. Rannacher), in E. Hirschel (Ed.): Flow Simulation with High-Performance     //\n"
-                              "//            Computers II. DFG Priority Research Program Results 1993-1995, No. 52 in Notes Numer, Fluid Mech.,        //\n"
-                              "//            pp.547-566, Vieweg, Weisbaden.                                                                            //\n"
+                              "//            Computers II. DFG Priority Research Program Results 1993-1995, No. 48 in Notes on Numerical Fluid         //\n"
+                              "//            Mechanics, pp.547-566, Vieweg, Weisbaden.                                                                 //\n"
                              "//                                                                                                                      //\n"
                              "//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////" );

@@ -3063,7 +3059,7 @@ int main( int argc, char **argv )
      refinementSelectionFunctions.add( cylinderRefinementSelection );
   }

-   refinementSelectionFunctions.add( std::bind( setInflowOutflowToSameLevel, std::placeholders::_1, setup ) );
+   refinementSelectionFunctions.add([&setup](auto & bf) { return setInflowOutflowToSameLevel(bf, setup); });

   if( setup.pseudo2D )
      refinementSelectionFunctions.add( Pseudo2DRefinementSelectionCorrection );

--- a/apps/benchmarks/TurbulentChannel/CMakeLists.txt
+++ b/apps/benchmarks/TurbulentChannel/CMakeLists.txt
+waLBerla_link_files_to_builddir( *.prm )
+
+if( WALBERLA_BUILD_WITH_CODEGEN )
+
+    #   Turbulent Channel generation
+    walberla_generate_target_from_python( NAME TurbulentChannel_CodeGeneration
+            FILE TurbulentChannel.py
+            OUT_FILES CodegenIncludes.h
+            TurbulentChannel_Sweep.cpp TurbulentChannel_Sweep.h
+            TurbulentChannel_PackInfo.cpp TurbulentChannel_PackInfo.h
+            TurbulentChannel_Setter.cpp TurbulentChannel_Setter.h
+            TurbulentChannel_NoSlip.cpp TurbulentChannel_NoSlip.h
+            TurbulentChannel_FreeSlip_top.cpp TurbulentChannel_FreeSlip_top.h
+            TurbulentChannel_WFB_bottom.cpp TurbulentChannel_WFB_bottom.h
+            TurbulentChannel_WFB_top.cpp TurbulentChannel_WFB_top.h
+            TurbulentChannel_Welford.cpp TurbulentChannel_Welford.h
+            TurbulentChannel_Welford_TKE_SGS.cpp TurbulentChannel_Welford_TKE_SGS.h
+            TurbulentChannel_TKE_SGS_Writer.cpp TurbulentChannel_TKE_SGS_Writer.h
+    )
+
+    walberla_add_executable( NAME TurbulentChannel_Application
+          FILES TurbulentChannel.cpp
+          DEPENDS walberla::blockforest walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::timeloop
+          walberla::lbm walberla::stencil walberla::vtk TurbulentChannel_CodeGeneration )
+
+endif()
\ No newline at end of file
--- a/apps/benchmarks/TurbulentChannel/TurbulentChannel.cpp
+++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file TurbulentChannel.cpp
+//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
+//
+//======================================================================================================================
+
+#include <memory>
+#include <cmath>
+#include <string>
+#include <iostream>
+
+#include <blockforest/all.h>
+#include <core/all.h>
+#include <domain_decomposition/all.h>
+#include <field/all.h>
+#include <field/vtk/VTKWriter.h>
+#include <geometry/all.h>
+#include <timeloop/all.h>
+#include <lbm/all.h>
+
+//    Codegen Includes
+#include "CodegenIncludes.h"
+
+namespace walberla {
+   ///////////////////////
+   /// Typedef Aliases ///
+   ///////////////////////
+
+   using Stencil_T = codegen::Stencil_T;
+
+   // Communication Pack Info
+   using PackInfo_T = pystencils::TurbulentChannel_PackInfo;
+
+   // PDF field type
+   using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >;
+
+   // Field Types
+   using ScalarField_T = field::GhostLayerField< real_t, 1 >;
+   using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >;
+   using TensorField_T = field::GhostLayerField< real_t, Stencil_T::D*Stencil_T::D >;
+
+   using Setter_T = pystencils::TurbulentChannel_Setter;
+
+   using StreamCollideSweep_T = pystencils::TurbulentChannel_Sweep;
+   using WelfordSweep_T = pystencils::TurbulentChannel_Welford;
+   using TKEWelfordSweep_T = pystencils::TurbulentChannel_Welford_TKE_SGS;
+
+   using TkeSgsWriter_T = pystencils::TurbulentChannel_TKE_SGS_Writer;
+
+   // Boundary Handling
+   using flag_t = uint8_t;
+   using FlagField_T = FlagField< flag_t >;
+   using NoSlip_T = lbm::TurbulentChannel_NoSlip;
+   using FreeSlip_top_T = lbm::TurbulentChannel_FreeSlip_top;
+   using WFB_bottom_T = lbm::TurbulentChannel_WFB_bottom;
+   using WFB_top_T = lbm::TurbulentChannel_WFB_top;
+
+   /// DEAN CORRELATIONS
+
+   namespace dean_correlation {
+
+      real_t calculateFrictionReynoldsNumber(const real_t reynoldsBulk) {
+         return std::pow(0.073_r / 8_r, 1_r / 2_r) * std::pow(reynoldsBulk, 7_r / 8_r);
+      }
+
+      real_t calculateBulkReynoldsNumber(const real_t reynoldsFriction) {
+         return std::pow(8_r / 0.073_r, 4_r / 7_r) * std::pow(reynoldsFriction, 8_r / 7_r);
+      }
+   } // namespace dean_correlation
+
+
+   /// VELOCITY FIELD INITIALISATION
+
+   /*
+    * Initialises the velocity field with a logarithmic profile and sinusoidal perturbations to trigger turbulence.
+    * This initialisation is provided by Henrik Asmuth.
+    */
+   template<typename VelocityField_T>
+   void setVelocityFieldsAsmuth( const std::weak_ptr<StructuredBlockStorage>& forest,
+                                 const BlockDataID & velocityFieldId, const BlockDataID & meanVelocityFieldId,
+                                 const real_t frictionVelocity, const uint_t channel_half_width,
+                                 const real_t B, const real_t kappa, const real_t viscosity,
+                                 const uint_t wallAxis, const uint_t flowAxis ) {
+
+      auto blocks = forest.lock();
+      WALBERLA_CHECK_NOT_NULLPTR(blocks)
+
+      const auto domainSize = blocks->getDomain().max();
+      const auto delta = real_c(channel_half_width);
+      const auto remAxis = 3 - wallAxis - flowAxis;
+
+      for( auto block = blocks->begin(); block != blocks->end(); ++block ) {
+
+         auto * velocityField = block->template getData<VelocityField_T>(velocityFieldId);
+         WALBERLA_CHECK_NOT_NULLPTR(velocityField)
+
+         auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityFieldId);
+         WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
+
+         const auto ci = velocityField->xyzSizeWithGhostLayer();
+         for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
+
+            Cell globalCell(*cellIt);
+            blocks->transformBlockLocalToGlobalCell(globalCell, *block);
+            Vector3<real_t> cellCenter;
+            blocks->getCellCenter(cellCenter, globalCell);
+
+            const auto y = cellCenter[wallAxis];
+            const auto rel_x = cellCenter[flowAxis] / domainSize[flowAxis];
+            const auto rel_z = cellCenter[remAxis] / domainSize[remAxis];
+
+            const real_t pos = std::max(delta - std::abs(y - delta - 1_r), 0.05_r);
+            const auto rel_y = pos / delta;
+
+            auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
+
+            Vector3<real_t> vel;
+            vel[flowAxis] = initialVel;
+
+            vel[remAxis] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r * rel_x) *
+                           std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r);
+
+            vel[wallAxis] = 8_r * frictionVelocity / kappa *
+                            (std::sin(math::pi * 8_r * rel_z) * std::sin(math::pi * 8_r * rel_y) +
+                             std::sin(math::pi * 8_r * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r);
+
+            for(uint_t d = 0; d < 3; ++d) {
+               velocityField->get(*cellIt, d) = vel[d];
+               meanVelocityField->get(*cellIt, d) = vel[d];
+            }
+
+         }
+      }
+
+   } // function setVelocityFieldsHenrik
+
+   /// SIMULATION PARAMETERS
+
+   struct SimulationParameters {
+
+      SimulationParameters(const Config::BlockHandle & config)
+      {
+         channelHalfWidth = config.getParameter<uint_t>("channel_half_width");
+         fullChannel = config.getParameter<bool>("full_channel", false);
+
+         /// TARGET QUANTITIES
+
+         targetFrictionReynolds = config.getParameter<real_t>("target_friction_reynolds");
+         targetBulkVelocity = config.getParameter<real_t>("target_bulk_velocity", 0.05_r);
+
+         targetBulkReynolds = dean_correlation::calculateBulkReynoldsNumber(targetFrictionReynolds);
+         viscosity = 2_r * real_c(channelHalfWidth) * targetBulkVelocity / targetBulkReynolds;
+         targetFrictionVelocity = targetFrictionReynolds * viscosity / real_c(channelHalfWidth);
+
+         /// TIMESTEPS
+
+         timesteps = config.getParameter<uint_t>("timesteps", 0);
+         const uint_t turnoverPeriods = config.getParameter<uint_t>("turnover_periods", 0);
+
+         WALBERLA_ASSERT((timesteps != 0) != (turnoverPeriods != 0),
+                         "Either timesteps OR turnover periods must be given.")
+
+         if(turnoverPeriods != 0) {
+            // turnover period defined by T = delta / u_tau
+            timesteps = turnoverPeriods * uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
+         }
+
+         /// DOMAIN DEFINITIONS
+
+         // obtained from codegen script -> adapt there
+         wallAxis = codegen::wallAxis;
+         flowAxis = codegen::flowAxis;
+
+         uint_t sizeFlowAxis      = config.getParameter<uint_t>("size_flow_axis", 0);
+         uint_t sizeRemainingAxis = config.getParameter<uint_t>("size_remaining_axis", 0);
+
+         WALBERLA_ASSERT_NOT_IDENTICAL(wallAxis, flowAxis, "Wall and flow axis must be different.")
+
+         const auto sizeFactor = channelHalfWidth / uint_t(10);
+         if( !sizeFlowAxis) sizeFlowAxis = sizeFactor * 64;
+         if( !sizeRemainingAxis) sizeRemainingAxis = sizeFactor * 32;
+
+         domainSize[wallAxis] = fullChannel ? 2 * channelHalfWidth : channelHalfWidth;
+         domainSize[flowAxis] = sizeFlowAxis;
+         domainSize[3- wallAxis - flowAxis] = sizeRemainingAxis;
+
+         periodicity[wallAxis] = false;
+
+         boundaryCondition = config.getParameter<std::string>("wall_boundary_condition", "WFB");
+
+         /// OUTPUT
+
+         auto tsPerPeriod = uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
+
+         vtkFrequency = config.getParameter<uint_t>("vtk_frequency", 0);
+         vtkStart = config.getParameter<uint_t>("vtk_start", 0);
+         plotFrequency = config.getParameter<uint_t>("plot_frequency", 0);
+         plotStart = config.getParameter<uint_t>("plot_start", 0);
+
+         // vtk start
+         vtkStart = config.getParameter<uint_t>("vtk_start_timesteps", 0);
+         const uint_t vtkStartPeriods = config.getParameter<uint_t>("vtk_start_periods", 0);
+
+         if(vtkStart || vtkStartPeriods) {
+            WALBERLA_ASSERT((vtkStart != 0) != (vtkStartPeriods != 0),
+                            "VTK start must be given in timesteps OR periods, not both.")
+         }
+
+         if(vtkStartPeriods != 0) {
+            // turnover period defined by T = delta / u_tau
+            vtkStart = vtkStartPeriods * tsPerPeriod;
+         }
+
+         // plot start
+         plotStart = config.getParameter<uint_t>("plot_start_timesteps", 0);
+         const uint_t plotStartPeriods = config.getParameter<uint_t>("plot_start_periods", 0);
+
+         if(plotStart || plotStartPeriods) {
+            WALBERLA_ASSERT((plotStart != 0) != (plotStartPeriods != 0),
+                            "Plotting start must be given in timesteps OR periods, not both.")
+         }
+
+         if(plotStartPeriods != 0) {
+            // turnover period defined by T = delta / u_tau
+            plotStart = plotStartPeriods * tsPerPeriod;
+         }
+
+         // frequencies
+         if(plotFrequency) {
+            timesteps = uint_c(std::ceil(real_c(timesteps) / real_c(plotFrequency))) * plotFrequency;
+         }
+
+         // sampling start & interval
+         samplingStart = config.getParameter<uint_t>("sampling_start_timesteps", 0);
+         const uint_t samplingStartPeriods = config.getParameter<uint_t>("sampling_start_periods", 0);
+
+         if(samplingStart || samplingStartPeriods) {
+            WALBERLA_ASSERT((samplingStart != 0) != (samplingStartPeriods != 0),
+                            "Sampling start must be given in timesteps OR periods, not both.")
+         }
+
+         if(samplingStartPeriods != 0) {
+            // turnover period defined by T = delta / u_tau
+            samplingStart = samplingStartPeriods * tsPerPeriod;
+         }
+
+         samplingInterval = config.getParameter<uint_t>("sampling_interval_timesteps", 0);
+         const uint_t samplingIntervalPeriods = config.getParameter<uint_t>("sampling_interval_periods", 0);
+
+         if(samplingInterval || samplingIntervalPeriods) {
+            WALBERLA_ASSERT((samplingInterval != 0) != (samplingIntervalPeriods != 0),
+                            "Sampling start must be given in timesteps OR periods, not both.")
+         }
+
+         if(samplingStartPeriods != 0) {
+            // turnover period defined by T = delta / u_tau
+            samplingInterval = samplingIntervalPeriods * tsPerPeriod;
+         }
+
+         timesteps += 1;
+
+      }
+
+      uint_t channelHalfWidth{};
+      bool fullChannel{};
+      Vector3<uint_t> domainSize{};
+      Vector3<uint_t> periodicity{true};
+
+      real_t targetFrictionReynolds{};
+      real_t targetBulkReynolds{};
+      real_t targetFrictionVelocity{};
+      real_t targetBulkVelocity{};
+
+      real_t viscosity{};
+      const real_t density{1.0};
+
+      uint_t timesteps{};
+
+      std::string boundaryCondition{};
+
+      uint_t wallAxis{};
+      uint_t flowAxis{};
+
+      /// output
+      uint_t vtkFrequency{};
+      uint_t vtkStart{};
+      uint_t plotFrequency{};
+      uint_t plotStart{};
+      uint_t samplingStart{};
+      uint_t samplingInterval{};
+
+   };
+
+
+   namespace boundaries {
+      void createBoundaryConfig(const SimulationParameters & parameters, Config::Block & boundaryBlock) {
+
+         auto & bottomWall = boundaryBlock.createBlock("Border");
+         bottomWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, true)]);
+         bottomWall.addParameter("walldistance", "-1");
+         if(parameters.boundaryCondition == "NoSlip") {
+            bottomWall.addParameter("flag", "NoSlip");
+         } else if(parameters.boundaryCondition == "WFB") {
+            bottomWall.addParameter("flag", "WFB_bottom");
+         }
+
+         auto & topWall = boundaryBlock.createBlock("Border");
+         topWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, false)]);
+         topWall.addParameter("walldistance", "-1");
+         if(parameters.fullChannel) {
+            if (parameters.boundaryCondition == "NoSlip") {
+               topWall.addParameter("flag", "NoSlip");
+            } else if (parameters.boundaryCondition == "WFB") {
+               topWall.addParameter("flag", "WFB_top");
+            }
+         } else {
+            topWall.addParameter("flag", "FreeSlip");
+         }
+
+      }
+   }
+
+   /// BULK VELOCITY CALCULATION
+   template< typename VelocityField_T >
+   class ForceCalculator {
+
+    public:
+      ForceCalculator(const std::weak_ptr<StructuredBlockStorage> & blocks, const BlockDataID meanVelocityId,
+                      const SimulationParameters & parameter)
+         : blocks_(blocks), meanVelocityId_(meanVelocityId), channelHalfWidth_(real_c(parameter.channelHalfWidth)),
+           targetBulkVelocity_(parameter.targetBulkVelocity), targetFrictionVelocity_(parameter.targetFrictionVelocity)
+      {
+         const auto & domainSize = parameter.domainSize;
+
+         Cell maxCell;
+         maxCell[parameter.wallAxis] = int_c(parameter.channelHalfWidth) - 1;
+         maxCell[flowDirection_] = int_c(domainSize[flowDirection_]) - 1;
+         const auto remainingIdx = 3 - parameter.wallAxis - flowDirection_;
+         maxCell[remainingIdx] = int_c(domainSize[remainingIdx]) - 1;
+         ci_ = CellInterval(Cell{}, maxCell);
+
+         numCells_ = real_c(parameter.channelHalfWidth * domainSize[flowDirection_] * domainSize[remainingIdx]);
+      }
+
+      real_t bulkVelocity() const { return bulkVelocity_; }
+      void setBulkVelocity(const real_t bulkVelocity) { bulkVelocity_ = bulkVelocity; }
+
+      void calculateBulkVelocity() {
+
+         // reset bulk velocity
+         bulkVelocity_ = 0_r;
+
+         auto blocks = blocks_.lock();
+         WALBERLA_CHECK_NOT_NULLPTR(blocks)
+
+         for( auto block = blocks->begin(); block != blocks->end(); ++block) {
+
+            auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityId_);
+            WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
+
+            auto fieldSize = meanVelocityField->xyzSize();
+            CellInterval localCi;
+            blocks->transformGlobalToBlockLocalCellInterval(localCi, *block, ci_);
+            fieldSize.intersect(localCi);
+
+            auto * slicedField = meanVelocityField->getSlicedField(fieldSize);
+            WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
+
+            for(auto fieldIt = slicedField->beginXYZ(); fieldIt != slicedField->end(); ++fieldIt) {
+               const auto localMean = fieldIt[flowDirection_];
+               bulkVelocity_ += localMean;
+            }
+
+         }
+
+         mpi::allReduceInplace< real_t >(bulkVelocity_, mpi::SUM);
+         bulkVelocity_ /= numCells_;
+
+      }
+
+      real_t calculateDrivingForce() const {
+
+         // forcing term as in Malaspinas (2014) "Wall model for large-eddy simulation based on the lattice Boltzmann method"
+         const auto force = targetFrictionVelocity_ * targetFrictionVelocity_ / channelHalfWidth_
+                            + (targetBulkVelocity_ - bulkVelocity_) * targetBulkVelocity_ / channelHalfWidth_;
+
+         return force;
+      }
+
+    private:
+      const std::weak_ptr<StructuredBlockStorage> blocks_{};
+      const BlockDataID meanVelocityId_{};
+
+      const uint_t flowDirection_{};
+      const real_t channelHalfWidth_{};
+      const real_t targetBulkVelocity_{};
+      const real_t targetFrictionVelocity_{};
+
+      CellInterval ci_{};
+
+      real_t numCells_{};
+      real_t bulkVelocity_{};
+   };
+
+   template< typename Welford_T >
+   class TurbulentChannelPlotter {
+
+    public:
+      TurbulentChannelPlotter(SimulationParameters const * const parameters, Timeloop * const timeloop,
+                              ForceCalculator<VectorField_T> const * const forceCalculator,
+                              const std::weak_ptr<StructuredBlockStorage> & blocks,
+                              const BlockDataID velocityFieldId, const BlockDataID meanVelocityFieldId,
+                              const BlockDataID meanTkeSGSFieldId, const BlockDataID sosFieldId, Welford_T * velocityWelford,
+                              const bool separateFile = false)
+         : parameters_(parameters), forceCalculator_(forceCalculator), timeloop_(timeloop), blocks_(blocks),
+           velocityWelford_(velocityWelford), velocityFieldId_(velocityFieldId), meanVelocityFieldId_(meanVelocityFieldId),
+           meanTkeSGSFieldId_(meanTkeSGSFieldId), sosFieldId_(sosFieldId), plotFrequency_(parameters->plotFrequency), plotStart_(parameters->plotStart),
+           separateFiles_(separateFile)
+      {
+         if(!plotFrequency_)
+            return;
+
+         // prepare output folder
+         const filesystem::path path(baseFolder_);
+         std::string fileSuffix = parameters->boundaryCondition + "_";
+
+         if(parameters->fullChannel)
+            fileSuffix += "full_D";
+         else
+            fileSuffix += "half_D";
+
+         fileSuffix += std::to_string(parameters->channelHalfWidth) + "_Re" +
+                       std::to_string(int(parameters->targetFrictionReynolds)) ;
+
+         velocityProfilesFilePath_ = path / ("velocity_profiles_" + fileSuffix);
+         forcingDataFilePath_ = path / ("forcing_data_" + fileSuffix + "_t" +
+                                        std::to_string(parameters->timesteps-1) + ".txt");
+
+         WALBERLA_ROOT_SECTION() {
+            // create directory if not existent; empty if existent
+            if( !filesystem::exists(path) ) {
+               filesystem::create_directories(path);
+            } else {
+               for (const auto& entry : filesystem::directory_iterator(path))
+                  std::filesystem::remove_all(entry.path());
+            }
+         }
+
+         // write force header
+         std::ofstream os (forcingDataFilePath_, std::ios::out | std::ios::trunc);
+         if(os.is_open()) {
+            os << "# timestep\t bulk_velocity\t driving_force\n";
+            os.close();
+         } else {
+            WALBERLA_ABORT("Could not open forcing data file.")
+         }
+
+      }
+
+      void operator()() {
+
+         const auto ts = timeloop_->getCurrentTimeStep();
+         if(ts < plotStart_)
+            return;
+
+         if(!plotFrequency_ || (ts % plotFrequency_))
+            return;
+
+         const auto channelHalfWidth = real_c(parameters_->channelHalfWidth);
+         const auto bulkVelocity = forceCalculator_->bulkVelocity();
+
+         /// write force data
+
+         WALBERLA_ROOT_SECTION() {
+            std::ofstream forceOS(forcingDataFilePath_, std::ios::out | std::ios::app);
+            if (forceOS.is_open())
+            {
+               forceOS << ts << "\t" << bulkVelocity << "\t" << forceCalculator_->calculateDrivingForce() << "\n";
+               forceOS.close();
+            }
+         }
+
+         /// write velocity data
+
+         // gather velocity data
+         std::vector<real_t> instantaneousVelocityVector(parameters_->channelHalfWidth, 0_r);
+         std::vector<real_t> meanVelocityVector(parameters_->channelHalfWidth, 0_r);
+         std::vector<real_t> tkeSGSVector(parameters_->channelHalfWidth, 0_r);
+         std::vector<real_t> tkeResolvedVector(parameters_->channelHalfWidth, 0_r);
+         std::vector<real_t> reynoldsStressVector(parameters_->channelHalfWidth * TensorField_T::F_SIZE, 0_r);
+
+         const auto idxFlow = int_c(parameters_->domainSize[parameters_->flowAxis] / uint_t(2));
+         const auto idxRem = int_c(parameters_->domainSize[3 - parameters_->flowAxis - parameters_->wallAxis] / uint_t(2));
+
+         Cell point;
+         point[parameters_->flowAxis] = idxFlow;
+         point[3 - parameters_->flowAxis - parameters_->wallAxis] = idxRem;
+
+         const auto flowAxis = int_c(parameters_->flowAxis);
+
+         auto blocks = blocks_.lock();
+         WALBERLA_CHECK_NOT_NULLPTR(blocks)
+
+         for(auto block = blocks->begin(); block != blocks->end(); ++block) {
+
+            const auto * const velocity = block->template getData<VectorField_T>(velocityFieldId_);
+            WALBERLA_CHECK_NOT_NULLPTR(velocity)
+
+            const auto * const meanVelocity = block->template getData<VectorField_T>(meanVelocityFieldId_);
+            WALBERLA_CHECK_NOT_NULLPTR(meanVelocity)
+
+            const auto * const tkeSGS = block->template getData<ScalarField_T>(meanTkeSGSFieldId_);
+            WALBERLA_CHECK_NOT_NULLPTR(tkeSGS)
+
+            const auto * const sos = block->template getData<TensorField_T>(sosFieldId_);
+            WALBERLA_CHECK_NOT_NULLPTR(sos)
+
+            for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
+
+               point[parameters_->wallAxis] = int_c(idx);
+
+               Cell localCell;
+               blocks->transformGlobalToBlockLocalCell(localCell, *block, point);
+
+               if(velocity->xyzSize().contains(localCell)){
+                  instantaneousVelocityVector[idx] = velocity->get(localCell, flowAxis);
+                  meanVelocityVector[idx] = meanVelocity->get(localCell, flowAxis);
+                  tkeSGSVector[idx] = tkeSGS->get(localCell);
+                  for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
+                     reynoldsStressVector[idx*TensorField_T::F_SIZE+i] = sos->get(localCell,i) / velocityWelford_->getCounter();
+                  }
+                  tkeResolvedVector[idx] = real_c(0.5) * (
+                     reynoldsStressVector[idx*TensorField_T::F_SIZE+0] +
+                     reynoldsStressVector[idx*TensorField_T::F_SIZE+4] +
+                     reynoldsStressVector[idx*TensorField_T::F_SIZE+8]
+                  );
+               }
+            }
+         }
+
+         // MPI exchange information
+         mpi::reduceInplace(instantaneousVelocityVector, mpi::SUM);
+         mpi::reduceInplace(meanVelocityVector, mpi::SUM);
+         mpi::reduceInplace(tkeSGSVector, mpi::SUM);
+         mpi::reduceInplace(tkeResolvedVector, mpi::SUM);
+         mpi::reduceInplace(reynoldsStressVector, mpi::SUM);
+
+         WALBERLA_ROOT_SECTION()
+         {
+            std::ofstream velocityOS;
+            filesystem::path path = velocityProfilesFilePath_;
+            if (separateFiles_) {
+               path.concat("_t" + std::to_string(timeloop_->getCurrentTimeStep()) + ".txt");
+               velocityOS.open(path, std::ios::out | std::ios::trunc);
+            } else {
+               path.concat("_t" + std::to_string(parameters_->timesteps-1) + ".txt");
+               velocityOS.open(path, std::ios::out | std::ios::trunc);
+            }
+
+            if (velocityOS.is_open()) {
+               if (!separateFiles_) velocityOS << "# t = " << ts << "\n";
+               velocityOS << "# y/delta\t y+\t u+\t u_mean\t u_instantaneous\t TKE_SGS\t TKE_resolved\t uu_rms\t uv_rms\t uw_rms\t vu_rms\t vv_rms\t vw_rms\t wu_rms\t wv_rms\t ww_rms\n";
+
+               const auto & viscosity = parameters_->viscosity;
+               const auto bulkReynolds = 2_r * channelHalfWidth * bulkVelocity / viscosity;
+               const auto frictionReynolds = dean_correlation::calculateFrictionReynoldsNumber(bulkReynolds);
+               const auto frictionVelocity = frictionReynolds * viscosity / channelHalfWidth;
+
+               for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
+                  // relative position
+                  velocityOS << (real_c(idx)+0.5_r) / channelHalfWidth << "\t";
+                  // y+
+                  velocityOS << (real_c(idx)+0.5_r) * frictionVelocity / viscosity << "\t";
+                  // u+
+                  velocityOS << meanVelocityVector[idx] / frictionVelocity << "\t";
+                  // mean velocity
+                  velocityOS << meanVelocityVector[idx] << "\t";
+                  // instantaneous velocity
+                  velocityOS << instantaneousVelocityVector[idx] << "\t";
+                  // subgrid-scale TKE
+                  velocityOS << tkeSGSVector[idx] << "\t";
+                  // resolved TKE
+                  velocityOS << tkeResolvedVector[idx] << "\t";
+                  // Reynolds stresses
+                  for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
+                     velocityOS << reynoldsStressVector[idx*TensorField_T::F_SIZE+i] << "\t";
+                  }
+
+                  velocityOS << "\n";
+               }
+               velocityOS.close();
+            } else{
+               WALBERLA_ABORT("Could not open velocity plot file " << path.generic_string())
+            }
+         }
+      }
+
+    private:
+
+
+      SimulationParameters const * const parameters_{};
+      ForceCalculator<VectorField_T> const * const forceCalculator_{};
+
+      Timeloop * const timeloop_{};
+      const std::weak_ptr<StructuredBlockStorage> blocks_;
+      Welford_T * const velocityWelford_{};
+
+      const BlockDataID velocityFieldId_{};
+      const BlockDataID meanVelocityFieldId_{};
+      const BlockDataID meanTkeSGSFieldId_{};
+      const BlockDataID sosFieldId_{};
+
+      const uint_t plotFrequency_{};
+      const uint_t plotStart_{};
+
+      const bool separateFiles_{false};
+      const std::string baseFolder_{"output"};
+      filesystem::path velocityProfilesFilePath_;
+      filesystem::path forcingDataFilePath_;
+   };
+
+   /////////////////////
+   /// Main Function ///
+   /////////////////////
+
+   int main(int argc, char** argv) {
+
+      Environment walberlaEnv(argc, argv);
+
+      if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!") }
+
+      ///////////////////////////////////////////////////////
+      /// Block Storage Creation and Simulation Parameter ///
+      ///////////////////////////////////////////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating block forest...")
+
+      const auto channelParameter = walberlaEnv.config()->getOneBlock("TurbulentChannel");
+      SimulationParameters simulationParameters(channelParameter);
+
+      // domain creation
+      std::shared_ptr<StructuredBlockForest> blocks;
+      {
+         Vector3< uint_t > numBlocks;
+         Vector3< uint_t > cellsPerBlock;
+         blockforest::calculateCellDistribution(simulationParameters.domainSize,
+                                                uint_c(mpi::MPIManager::instance()->numProcesses()),
+                                                numBlocks, cellsPerBlock);
+
+         const auto & periodicity = simulationParameters.periodicity;
+         auto & domainSize = simulationParameters.domainSize;
+         const Vector3<uint_t> newDomainSize(numBlocks[0] * cellsPerBlock[0], numBlocks[1] * cellsPerBlock[1], numBlocks[2] * cellsPerBlock[2]);
+
+         if(domainSize != newDomainSize) {
+            domainSize = newDomainSize;
+            WALBERLA_LOG_WARNING_ON_ROOT("\nWARNING: Domain size has changed due to the chosen domain decomposition.\n")
+         }
+
+         SetupBlockForest sforest;
+
+         sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
+
+         sforest.init( AABB(0_r, 0_r, 0_r, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2])),
+                       numBlocks[0], numBlocks[1], numBlocks[2], periodicity[0], periodicity[1], periodicity[2] );
+
+         // calculate process distribution
+
+         const memory_t memoryLimit = numeric_cast< memory_t >( sforest.getNumberOfBlocks() );
+
+         const blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
+            true, false, std::bind( blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2,
+                                    cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] ) );
+
+         sforest.calculateProcessDistribution_Default( uint_c( MPIManager::instance()->numProcesses() ), memoryLimit,
+                                                       "hilbert", 10, false, metisConfig );
+
+         if( !MPIManager::instance()->rankValid() )
+            MPIManager::instance()->useWorldComm();
+
+         // create StructuredBlockForest (encapsulates a newly created BlockForest)
+
+         WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << sforest)
+
+         sforest.writeVTKOutput("domain_decomposition");
+
+         auto bf = std::make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false );
+
+         blocks = std::make_shared< StructuredBlockForest >( bf, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] );
+         blocks->createCellBoundingBoxes();
+
+      }
+
+      ////////////////////////////////////
+      /// PDF Field and Velocity Setup ///
+      ////////////////////////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating fields...")
+
+      // Common Fields
+      const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), codegen::layout);
+      const BlockDataID meanVelocityFieldId = field::addToStorage< VectorField_T >(blocks, "mean velocity", real_c(0.0), codegen::layout);
+      const BlockDataID sosFieldId = field::addToStorage< TensorField_T >(blocks, "sum of squares", real_c(0.0), codegen::layout);
+
+      const BlockDataID tkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "tke_SGS", real_c(0.0), codegen::layout);
+      const BlockDataID meanTkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "mean_tke_SGS", real_c(0.0), codegen::layout);
+
+      const BlockDataID omegaFieldId = field::addToStorage< ScalarField_T >(blocks, "omega_out", real_c(0.0), codegen::layout);
+
+      const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+      // CPU Field for PDFs
+      const BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), codegen::layout);
+
+      ///////////////////////////////////////////
+      /// Force and bulk velocity calculation ///
+      ///////////////////////////////////////////
+
+      ForceCalculator<VectorField_T> forceCalculator(blocks, velocityFieldId, simulationParameters);
+
+      //////////////
+      /// Setter ///
+      //////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Setting up fields...")
+
+      // Velocity field setup
+      setVelocityFieldsAsmuth<VectorField_T>(
+         blocks, velocityFieldId, meanVelocityFieldId,
+         simulationParameters.targetFrictionVelocity, simulationParameters.channelHalfWidth,
+         5.5_r, 0.41_r, simulationParameters.viscosity,
+         simulationParameters.wallAxis, simulationParameters.flowAxis );
+
+      forceCalculator.setBulkVelocity(simulationParameters.targetBulkVelocity);
+      const auto initialForce = forceCalculator.calculateDrivingForce();
+
+      // pdfs setup
+      Setter_T pdfSetter(pdfFieldId, velocityFieldId, initialForce, simulationParameters.density);
+
+      for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+         pdfSetter(blockIt.get());
+
+      /////////////
+      /// Sweep ///
+      /////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating sweeps...")
+
+      const auto omega = lbm::collision_model::omegaFromViscosity(simulationParameters.viscosity);
+      StreamCollideSweep_T streamCollideSweep(omegaFieldId, pdfFieldId, velocityFieldId, initialForce, omega);
+
+      WelfordSweep_T welfordSweep(meanVelocityFieldId, sosFieldId, velocityFieldId, 0_r);
+      TKEWelfordSweep_T welfordTKESweep(meanTkeSgsFieldId, tkeSgsFieldId, 0_r);
+
+      TkeSgsWriter_T tkeSgsWriter(omegaFieldId, pdfFieldId, tkeSgsFieldId, initialForce, omega);
+
+      /////////////////////////
+      /// Boundary Handling ///
+      /////////////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating boundary handling...")
+
+      const FlagUID fluidFlagUID("Fluid");
+
+      Config::Block boundaryBlock;
+      boundaries::createBoundaryConfig(simulationParameters, boundaryBlock);
+
+      std::unique_ptr<WFB_bottom_T> wfb_bottom_ptr = std::make_unique<WFB_bottom_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
+      std::unique_ptr<WFB_top_T > wfb_top_ptr = std::make_unique<WFB_top_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
+
+      NoSlip_T noSlip(blocks, pdfFieldId);
+      FreeSlip_top_T freeSlip_top(blocks, pdfFieldId);
+
+      geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, Config::BlockHandle(&boundaryBlock));
+      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
+
+      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
+      freeSlip_top.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("FreeSlip"), fluidFlagUID);
+      wfb_bottom_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_bottom"), fluidFlagUID);
+      wfb_top_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_top"), fluidFlagUID);
+
+      //////////////
+      /// Output ///
+      //////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating field output...")
+
+      // vtk output
+      auto vtkWriter = vtk::createVTKOutput_BlockData(
+         blocks, "field_writer", simulationParameters.vtkFrequency, 0, false, "vtk_out", "simulation_step",
+         false, false, true, false
+      );
+      vtkWriter->setInitialWriteCallsToSkip(simulationParameters.vtkStart);
+
+      // velocity field writer
+      auto velocityWriter = std::make_shared<field::VTKWriter<VectorField_T>>(velocityFieldId, "instantaneous velocity");
+      vtkWriter->addCellDataWriter(velocityWriter);
+
+      auto meanVelocityFieldWriter = std::make_shared<field::VTKWriter<VectorField_T>>(meanVelocityFieldId, "mean velocity");
+      vtkWriter->addCellDataWriter(meanVelocityFieldWriter);
+
+      // vtk writer
+      {
+         auto flagOutput = vtk::createVTKOutput_BlockData(
+            blocks, "flag_writer", 1, 1, false, "vtk_out", "simulation_step",
+            false, true, true, false
+         );
+         auto flagWriter = std::make_shared<field::VTKWriter<FlagField_T>>(flagFieldId, "flag field");
+         flagOutput->addCellDataWriter(flagWriter);
+         flagOutput->write();
+      }
+
+
+      /////////////////
+      /// Time Loop ///
+      /////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Creating timeloop...")
+
+      SweepTimeloop timeloop(blocks->getBlockStorage(), simulationParameters.timesteps);
+
+      // Communication
+      blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
+      communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
+
+      auto setNewForce = [&](const real_t newForce) {
+         streamCollideSweep.setF_x(newForce);
+         tkeSgsWriter.setF_x(newForce);
+      };
+
+      // plotting
+      const bool outputSeparateFiles = channelParameter.getParameter<bool>("separate_files", false);
+      const TurbulentChannelPlotter<WelfordSweep_T > plotter(&simulationParameters, &timeloop, &forceCalculator, blocks,
+                                                             velocityFieldId, meanVelocityFieldId,
+                                                             meanTkeSgsFieldId, sosFieldId, &welfordSweep,
+                                                             outputSeparateFiles);
+
+      //NOTE must convert sweeps that are altered to lambdas, otherwise copy and counter will stay 0
+      auto welfordLambda = [&welfordSweep, &welfordTKESweep](IBlock * block) {
+         welfordSweep(block);
+         welfordTKESweep(block);
+      };
+
+      auto wfbLambda = [&wfb_bottom_ptr, &wfb_top_ptr](IBlock * block) {
+         wfb_bottom_ptr->operator()(block);
+         wfb_top_ptr->operator()(block);
+      };
+
+      auto streamCollideLambda = [&streamCollideSweep](IBlock * block) {
+         streamCollideSweep(block);
+      };
+
+      // Timeloop
+      timeloop.add() << BeforeFunction(communication, "communication")
+                     << BeforeFunction([&](){forceCalculator.calculateBulkVelocity();}, "bulk velocity calculation")
+                     << BeforeFunction([&](){
+                           const auto newForce = forceCalculator.calculateDrivingForce();
+                           setNewForce(newForce);
+                        }, "new force setter")
+                     << Sweep([](IBlock *){}, "new force setter");
+      timeloop.add() << Sweep(freeSlip_top, "freeSlip");
+      timeloop.add() << Sweep(noSlip, "noSlip");
+      timeloop.add() << Sweep(wfbLambda, "wall function bounce");
+      timeloop.add() << Sweep(streamCollideLambda, "stream and collide");
+      timeloop.add() << BeforeFunction([&](){
+                           const uint_t velCtr = uint_c(welfordSweep.getCounter());
+                           if((timeloop.getCurrentTimeStep() == simulationParameters.samplingStart) ||
+                              (timeloop.getCurrentTimeStep() > simulationParameters.samplingStart && simulationParameters.samplingInterval && (velCtr % simulationParameters.samplingInterval == 0))) {
+                              welfordSweep.setCounter(real_c(0.0));
+                              welfordTKESweep.setCounter(real_c(0.0));
+                               for(auto & block : *blocks) {
+                                  auto * sopField = block.template getData<TensorField_T >(sosFieldId);
+                                  sopField->setWithGhostLayer(0.0);
+
+                                  auto * tkeField = block.template getData<ScalarField_T>(tkeSgsFieldId);
+                                  tkeField->setWithGhostLayer(0.0);
+                               }
+                           }
+
+                           welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
+                           welfordTKESweep.setCounter(welfordTKESweep.getCounter() + real_c(1.0));
+                        }, "welford sweep")
+                     << Sweep(welfordLambda, "welford sweep");
+      timeloop.add() << Sweep(tkeSgsWriter, "TKE_SGS writer");
+
+      timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkWriter), "VTK field output");
+      timeloop.addFuncAfterTimeStep(plotter, "Turbulent quantity plotting");
+
+      // LBM stability check
+      timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >(
+                                       walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ),
+                                    "LBM stability check" );
+
+      // Time logger
+      timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 5_r),
+                                    "remaining time logger");
+
+
+      WALBERLA_LOG_INFO_ON_ROOT("Running timeloop with " << timeloop.getNrOfTimeSteps() - 1 << " timesteps...")
+
+      WcTimingPool timing;
+
+      WcTimer timer;
+      timer.start();
+
+      timeloop.run(timing);
+
+      timer.end();
+
+      double time = timer.max();
+      walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
+
+      const auto timeloopTiming = timing.getReduced();
+      WALBERLA_LOG_INFO_ON_ROOT("Timeloop timing:\n" << *timeloopTiming)
+
+      const walberla::lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID);
+      performance.logResultOnRoot(simulationParameters.timesteps, time);
+
+      return EXIT_SUCCESS;
+   }
+
+} // namespace walberla
+
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
--- a/apps/benchmarks/TurbulentChannel/TurbulentChannel.prm
+++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.prm
+
+TurbulentChannel {
+
+    channel_half_width           20;
+    full_channel                  0;
+
+    wall_boundary_condition     WFB;
+
+    target_friction_Reynolds    395;
+    target_bulk_velocity        0.1;
+
+    // turnover_periods              50;
+    timesteps                   100;
+
+    // sampling_start_timesteps      50;
+    // sampling_start_periods      1;
+    // sampling_interval_timesteps   20;
+    // sampling_interval_periods   1;
+
+    // vtk_start_timesteps        50;
+    // vtk_start_periods          1000;
+    // plot_start_timesteps       50;
+    // plot_start_periods         1000;
+    vtk_frequency              10;
+    plot_frequency             10;
+    separate_files                0;
+
+}
+
+StabilityChecker
+{
+   checkFrequency 10000;
+   streamOutput   false;
+   vtkOutput      true;
+}
--- a/apps/benchmarks/TurbulentChannel/TurbulentChannel.py
+++ b/apps/benchmarks/TurbulentChannel/TurbulentChannel.py
+import sympy as sp
+import pystencils as ps
+
+from lbmpy.enums import SubgridScaleModel
+from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel
+from lbmpy.flow_statistics import welford_assignments
+from lbmpy.relaxationrates import lattice_viscosity_from_relaxation_rate
+
+from lbmpy.creationfunctions import create_lb_update_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+
+from lbmpy.boundaries import NoSlip, FreeSlip, WallFunctionBounce
+from lbmpy.boundaries.wall_function_models import SpaldingsLaw, MoninObukhovSimilarityTheory
+from lbmpy.utils import frobenius_norm, second_order_moment_tensor
+
+from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
+from lbmpy_walberla import generate_boundary
+
+#   =====================
+#      Code Generation
+#   =====================
+
+info_header = """
+#ifndef TURBULENTCHANNEL_INCLUDES
+#define TURBULENTCHANNEL_INCLUDES
+
+#include <stencil/D{d}Q{q}.h>
+
+#include "TurbulentChannel_Sweep.h"
+#include "TurbulentChannel_PackInfo.h"
+#include "TurbulentChannel_Setter.h"
+#include "TurbulentChannel_Welford.h"
+#include "TurbulentChannel_Welford_TKE_SGS.h"
+#include "TurbulentChannel_TKE_SGS_Writer.h"
+
+#include "TurbulentChannel_NoSlip.h"
+#include "TurbulentChannel_FreeSlip_top.h"
+#include "TurbulentChannel_WFB_top.h"
+#include "TurbulentChannel_WFB_bottom.h"
+
+namespace walberla {{
+    namespace codegen {{
+        using Stencil_T = walberla::stencil::D{d}Q{q};
+        static constexpr uint_t flowAxis = {flow_axis};
+        static constexpr uint_t wallAxis = {wall_axis};
+        
+        static constexpr field::Layout layout = field::{layout};
+    }}
+}}
+
+#endif // TURBULENTCHANNEL_INCLUDES
+"""
+
+
+def check_axis(flow_axis, wall_axis):
+    assert flow_axis != wall_axis
+    assert flow_axis < 3
+    assert wall_axis < 3
+
+
+with CodeGeneration() as ctx:
+
+    flow_axis = 0
+    wall_axis = 1
+
+    check_axis(flow_axis=flow_axis, wall_axis=wall_axis)
+
+    #   ========================
+    #      General Parameters
+    #   ========================
+    target = ps.Target.CPU
+
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = LBStencil(Stencil.D3Q19)
+    omega = sp.Symbol('omega')
+
+    F_x = sp.Symbol('F_x')
+    force_vector = [0] * 3
+    force_vector[flow_axis] = F_x
+
+    layout = 'fzyx'
+
+    normal_direction_top = [0] * 3
+    normal_direction_top[wall_axis] = -1
+    normal_direction_top = tuple(normal_direction_top)
+
+    normal_direction_bottom = [0] * 3
+    normal_direction_bottom[wall_axis] = 1
+    normal_direction_bottom = tuple(normal_direction_bottom)
+
+    #   PDF Fields
+    pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[{stencil.D}D]', layout=layout)
+
+    #   Output Fields
+    omega_field = ps.fields(f"omega_out: {data_type}[{stencil.D}D]", layout=layout)
+    sgs_tke = ps.fields(f"sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
+    mean_sgs_tke = ps.fields(f"mean_sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
+    velocity = ps.fields(f"velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
+    mean_velocity = ps.fields(f"mean_velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
+    sum_of_squares_field = ps.fields(f"sum_of_squares_field({stencil.D**2}): {data_type}[{stencil.D}D]", layout=layout)
+
+    # LBM Optimisation
+    lbm_opt = LBMOptimisation(cse_global=True,
+                              symbolic_field=pdfs,
+                              symbolic_temporary_field=pdfs_tmp,
+                              field_layout=layout)
+
+    #   ==================
+    #      Method Setup
+    #   ==================
+
+    lbm_config = LBMConfig(stencil=stencil,
+                           method=Method.CUMULANT,
+                           force_model=ForceModel.GUO,
+                           force=tuple(force_vector),
+                           relaxation_rate=omega,
+                           subgrid_scale_model=SubgridScaleModel.QR,
+                           # galilean_correction=True,
+                           compressible=True,
+                           omega_output_field=omega_field,
+                           output={'velocity': velocity})
+
+    update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+    lbm_method = update_rule.method
+
+    #   ========================
+    #      PDF Initialization
+    #   ========================
+
+    initial_rho = sp.Symbol('rho_0')
+
+    pdfs_setter = macroscopic_values_setter(lbm_method,
+                                            initial_rho,
+                                            velocity.center_vector,
+                                            pdfs.center_vector)
+
+    #   LBM Sweep
+    generate_sweep(ctx, "TurbulentChannel_Sweep", update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
+
+    #   Pack Info
+    generate_pack_info_from_kernel(ctx, "TurbulentChannel_PackInfo", update_rule, target=target)
+
+    #   Macroscopic Values Setter
+    generate_sweep(ctx, "TurbulentChannel_Setter", pdfs_setter, target=target, ghost_layers_to_include=1)
+
+    #   Welford update
+    # welford_update = welford_assignments(vector_field=velocity, mean_vector_field=mean_velocity)
+    welford_update = welford_assignments(field=velocity, mean_field=mean_velocity,
+                                         sum_of_squares_field=sum_of_squares_field)
+    generate_sweep(ctx, "TurbulentChannel_Welford", welford_update, target=target)
+
+    tke_welford_update = welford_assignments(field=sgs_tke, mean_field=mean_sgs_tke)
+    generate_sweep(ctx, "TurbulentChannel_Welford_TKE_SGS", tke_welford_update, target=target)
+
+    # subgrid TKE output
+    @ps.kernel
+    def tke_sgs_writer():
+        f_neq = sp.Matrix(pdfs.center_vector) - lbm_method.get_equilibrium_terms()
+        rho = lbm_method.conserved_quantity_computation.density_symbol
+        strain_rate = frobenius_norm(-3 * omega_field.center / (2 * rho) * second_order_moment_tensor(f_neq, lbm_method.stencil))
+        eddy_viscosity = lattice_viscosity_from_relaxation_rate(omega_field.center) - lattice_viscosity_from_relaxation_rate(omega)
+
+        sgs_tke.center @= (eddy_viscosity * strain_rate**2)**(2.0/3.0)
+
+    tke_sgs_ac = ps.AssignmentCollection(
+        [lbm_method.conserved_quantity_computation.equilibrium_input_equations_from_pdfs(pdfs.center_vector),
+         *tke_sgs_writer]
+    )
+    generate_sweep(ctx, "TurbulentChannel_TKE_SGS_Writer", tke_sgs_ac)
+
+    #   Boundary conditions
+    nu = lattice_viscosity_from_relaxation_rate(omega)
+    u_tau_target = sp.Symbol("target_u_tau")
+
+    noslip = NoSlip()
+    freeslip_top = FreeSlip(stencil, normal_direction=normal_direction_top)
+    wfb_top = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_top,
+                                 wall_function_model=SpaldingsLaw(viscosity=nu,
+                                                                  kappa=0.41, b=5.5, newton_steps=5),
+                                 mean_velocity=mean_velocity, data_type=data_type,
+                                 target_friction_velocity=u_tau_target)
+    wfb_bottom = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_bottom,
+                                    wall_function_model=SpaldingsLaw(viscosity=nu,
+                                                                     kappa=0.41, b=5.5, newton_steps=5),
+                                    mean_velocity=mean_velocity, data_type=data_type,
+                                    target_friction_velocity=u_tau_target)
+
+    generate_boundary(ctx, "TurbulentChannel_NoSlip", noslip, lbm_method, target=target)
+    generate_boundary(ctx, "TurbulentChannel_FreeSlip_top", freeslip_top, lbm_method, target=target)
+    generate_boundary(ctx, "TurbulentChannel_WFB_bottom", wfb_bottom, lbm_method, target=target)
+    generate_boundary(ctx, "TurbulentChannel_WFB_top", wfb_top, lbm_method, target=target)
+
+    info_header_params = {
+        'layout': layout,
+        'd': stencil.D,
+        'q': stencil.Q,
+        'flow_axis': flow_axis,
+        'wall_axis': wall_axis
+    }
+
+    ctx.write_file("CodegenIncludes.h", info_header.format(**info_header_params))
--- a/apps/benchmarks/UniformGrid/CMakeLists.txt
+++ b/apps/benchmarks/UniformGrid/CMakeLists.txt
@@ -2,5 +2,5 @@
 waLBerla_link_files_to_builddir( "*.dat" )


-waLBerla_add_executable ( NAME UniformGridBenchmark 
-                          DEPENDS blockforest boundary core domain_decomposition field lbm postprocessing timeloop vtk sqlite )
\ No newline at end of file
+waLBerla_add_executable ( NAME UniformGridBenchmark
+      DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::lbm walberla::postprocessing walberla::timeloop walberla::vtk walberla::sqlite )
\ No newline at end of file
--- a/apps/benchmarks/UniformGrid/UniformGrid.cpp
+++ b/apps/benchmarks/UniformGrid/UniformGrid.cpp
 //======================================================================================================================
 //
-//  This file is part of waLBerla. waLBerla is free software: you can 
+//  This file is part of waLBerla. waLBerla is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  waLBerla is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -101,12 +101,12 @@ using walberla::real_t;
 // TYPEDEFS //
 //////////////

-typedef lbm::D3Q19< lbm::collision_model::SRT,           false > D3Q19_SRT_INCOMP;
-typedef lbm::D3Q19< lbm::collision_model::SRT,           true  > D3Q19_SRT_COMP;
-typedef lbm::D3Q19< lbm::collision_model::TRT,           false > D3Q19_TRT_INCOMP;
-typedef lbm::D3Q19< lbm::collision_model::TRT,           true  > D3Q19_TRT_COMP;
-typedef lbm::D3Q19< lbm::collision_model::D3Q19MRT,      false > D3Q19_MRT_INCOMP;
-typedef lbm::D3Q27< lbm::collision_model::D3Q27Cumulant, true  > D3Q27_CUMULANT_COMP;
+using D3Q19_SRT_INCOMP = lbm::D3Q19<lbm::collision_model::SRT, false>;
+using D3Q19_SRT_COMP = lbm::D3Q19<lbm::collision_model::SRT, true>;
+using D3Q19_TRT_INCOMP = lbm::D3Q19<lbm::collision_model::TRT, false>;
+using D3Q19_TRT_COMP = lbm::D3Q19<lbm::collision_model::TRT, true>;
+using D3Q19_MRT_INCOMP = lbm::D3Q19<lbm::collision_model::D3Q19MRT, false>;
+using D3Q27_CUMULANT_COMP = lbm::D3Q27<lbm::collision_model::D3Q27Cumulant, true>;


 template< typename LatticeModel_T >
@@ -269,31 +269,20 @@ void createSetupBlockForest( blockforest::SetupBlockForest & sforest, const Conf

      WALBERLA_MPI_SECTION()
      {
-         if ( MPIManager::instance()->isCartesianCommValid() )
-         {
-            MPIManager::instance()->createCartesianComm(numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, false, false, false);
+         MPIManager::instance()->createCartesianComm(numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, false, false, false);

-            processIdMap = make_shared<std::vector<uint_t> >(numberOfXProcesses * numberOfYProcesses * numberOfZProcesses);
+         processIdMap = make_shared<std::vector<uint_t> >(numberOfXProcesses * numberOfYProcesses * numberOfZProcesses);

-            for (uint_t z = 0; z != numberOfZProcesses; ++z) {
-               for (uint_t y = 0; y != numberOfYProcesses; ++y) {
-                  for (uint_t x = 0; x != numberOfXProcesses; ++x)
-                  {
-                     (*processIdMap)[z * numberOfXProcesses * numberOfYProcesses + y * numberOfXProcesses + x] =
-                           uint_c(MPIManager::instance()->cartesianRank(x, y, z));
-                  }
+         for (uint_t z = 0; z != numberOfZProcesses; ++z) {
+            for (uint_t y = 0; y != numberOfYProcesses; ++y) {
+               for (uint_t x = 0; x != numberOfXProcesses; ++x)
+               {
+                  (*processIdMap)[z * numberOfXProcesses * numberOfYProcesses + y * numberOfXProcesses + x] =
+                     uint_c(MPIManager::instance()->cartesianRank(x, y, z));
               }
            }
         }
-         else {
-            WALBERLA_LOG_WARNING_ON_ROOT( "Your version of OpenMPI contains a bug. See waLBerla issue #73 for more "
-                                          "information. As a workaround, MPI_COMM_WORLD instead of a "
-                                          "Cartesian MPI communicator is used." );
-            MPIManager::instance()->useWorldComm();
-         }
      }
-      else
-         MPIManager::instance()->useWorldComm();

      sforest.balanceLoad( blockforest::CartesianDistribution( numberOfXProcesses, numberOfYProcesses, numberOfZProcesses, processIdMap.get() ),
                           numberOfXProcesses * numberOfYProcesses * numberOfZProcesses );
@@ -358,10 +347,10 @@ class MyBoundaryHandling
 {
 public:

-   typedef lbm::NoSlip< LatticeModel_T, flag_t >    NoSlip_T;
-   typedef lbm::SimpleUBB< LatticeModel_T, flag_t > UBB_T;
+   using NoSlip_T = lbm::NoSlip<LatticeModel_T, flag_t>;
+   using UBB_T = lbm::SimpleUBB<LatticeModel_T, flag_t>;

-   typedef BoundaryHandling< FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, UBB_T > BoundaryHandling_T;
+   using BoundaryHandling_T = BoundaryHandling<FlagField_T, typename Types<LatticeModel_T>::Stencil_T, NoSlip_T, UBB_T>;



@@ -576,7 +565,7 @@ struct AddLB
            }
            else
            {
-               typedef lbm::SplitSweep< LatticeModel_T, FlagField_T > Sweep_T;
+               using Sweep_T = lbm::SplitSweep<LatticeModel_T, FlagField_T>;
               auto sweep = make_shared< Sweep_T >( pdfFieldId, flagFieldId, Fluid_Flag );

               timeloop.add() << Sweep( lbm::CollideSweep< Sweep_T >( sweep ), "split LB sweep (collide)" );
@@ -615,7 +604,7 @@ struct AddLB< LatticeModel_T, typename std::enable_if< std::is_same< typename La
      std::function< void () > commFunction;
      if( directComm )
      {
-         
+
         if( fullComm )
         {
            blockforest::communication::UniformDirectScheme< stencil::D3Q27 > comm( blocks );

--- a/apps/benchmarks/UniformGrid/ecmModel.py
+++ b/apps/benchmarks/UniformGrid/ecmModel.py
@@ -3,75 +3,78 @@
 import numpy as np
 import matplotlib.pyplot as plt

-
 kernels = dict()

+
 class Kernel:
-    
-    def __init__(self,name, cyclesFirstLoop=0, cyclesSecondLoop=0, cyclesRegPerLUP =0):
+
+    def __init__(self, name, cyclesFirstLoop=0, cyclesSecondLoop=0, cyclesRegPerLUP=0):
        self.name = name
        if cyclesRegPerLUP <= 0:
-            self.cyclesFirstLoop  = cyclesFirstLoop
+            self.cyclesFirstLoop = cyclesFirstLoop
            self.cyclesSecondLoop = cyclesSecondLoop
-            self.cyclesRegPerLUP = cyclesFirstLoop + 9* cyclesSecondLoop  
+            self.cyclesRegPerLUP = cyclesFirstLoop + 9 * cyclesSecondLoop
        else:
            self.cyclesRegPerLUP = cyclesRegPerLUP
-         
-        self.cyclesRegPerCacheLine = 8*self.cyclesRegPerLUP

-        self.cyclesL1L2 = 3*19*2
-        self.cyclesL2L3 = 3*19*2
-        
+        self.cyclesRegPerCacheLine = 8 * self.cyclesRegPerLUP
+
+        self.cyclesL1L2 = 3 * 19 * 2
+        self.cyclesL2L3 = 3 * 19 * 2
+
        self.freq = 2.7e9
        self.cyclesMem = 305
-        #self.cyclesMem = 191
-     
+        # self.cyclesMem = 191
+
    def mlups(self, processes):
        singleCoreCycles = self.cyclesRegPerCacheLine + self.cyclesL1L2 + self.cyclesL2L3 + self.cyclesMem
-                                     
+
        timeSingleCore = singleCoreCycles / self.freq
-        
+
        mlups = 8 / timeSingleCore * 1e-6
-        
-        #todo         
+
+        # todo
        mlupsMax = 78
-        
-        return min ( processes * mlups, mlupsMax )
-       
-    def plot( self,  divideByProcesses=False,processes=8, label="" ):
-        
-        x = np.arange( 1, processes+1, 1 )
+
+        return min(processes * mlups, mlupsMax)
+
+    def plot(self, divideByProcesses=False, processes=8, label=""):
+
+        x = np.arange(1, processes + 1, 1)
        if divideByProcesses:
-            y = np.array( [ self.mlups(i)/i for i in x ] )
+            y = np.array([self.mlups(i) / i for i in x])
        else:
-            y = np.array( [ self.mlups(i)   for i in x ] )
-        
-        if label=="":
-            label = "ecm\_" + self.name
-        plt.plot( x, y, marker='^', markersize=5, label = label)
-        
+            y = np.array([self.mlups(i) for i in x])

-kernels=dict()
+        if label == "":
+            label = "ecm_" + self.name
+        plt.plot(x, y, marker='^', markersize=5, label=label)

-#kernels['srt_split'] = Kernel("srt_split", 46, 12 )

-kernels['srt_pure']   = Kernel("srt_pure", 40, 8 )
-kernels['trt_split']  = Kernel("trt\_split", 41, 11 )
-kernels['srt_nonopt'] = Kernel("srt_nonopt", cyclesRegPerLUP = 1045)   #SRTStreamCollide.h -  pgo and lto (20cycles first loop, 35 second)
+kernels = dict()
+
+# kernels['srt_split'] = Kernel("srt_split", 46, 12 )

-#kernels['trt_pure_intelOpt'] = Kernel("trt_pure_intelOpt", 41/2, 10/2 )  # vectorized (v*pd)
+kernels['srt_pure'] = Kernel("srt_pure", 40, 8)
+kernels['trt_split'] = Kernel("trt_split", 41, 11)
+# SRTStreamCollide.h -  pgo and lto (20cycles first loop, 35 second)
+kernels['srt_nonopt'] = Kernel("srt_nonopt",
+                               cyclesRegPerLUP=1045)


-def plotAllKernels( divideByProcesses = False ):
+# kernels['trt_pure_intelOpt'] = Kernel("trt_pure_intelOpt", 41/2, 10/2 )  # vectorized (v*pd)
+
+
+def plotAllKernels(divideByProcesses=False):
    for kernel in kernels:
-        kernel.plot( divideByProcesses )
+        kernel.plot(divideByProcesses)
+
+
+def plot(kernelName, divideByProcesses=False, label=""):
+    kernels[kernelName].plot(divideByProcesses, label=label)

-def plot( kernelName, divideByProcesses = False, label = ""):
-    kernels[kernelName].plot( divideByProcesses, label=label )
-    
-    

 if __name__ == "__main__":
    plotAllKernels()
    plt.legend()
-    plt.show()
\ No newline at end of file
+    plt.show()
No results found