Commit fb98460a authored by Philipp Suffa

Merge branch 'UpdateBenchmarks' into 'master'

Update benchmarks

See merge request walberla/walberla!685
parents 0c0092f3 a74df61b
Showing 911 additions and 482 deletions
......@@ -10,6 +10,7 @@ qrc_*
# CLion indexing
*.uuid
.fleet
# Generated files
......@@ -32,11 +33,13 @@ qrc_*
# Visual Studio Code
/.vscode
# Zed
/.cache*
# CLion
*.idea
*.clion*
# QtCreator
CMakeLists.txt.user.*
......
......@@ -11,11 +11,6 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
NonUniformGridCPUBoundaryCollection.h
NonUniformGridCPUInfoHeader.h)
waLBerla_add_executable( NAME NonUniformGridGenerator
FILES NonUniformGridGenerator.cpp LdcSetup.h
DEPENDS blockforest core field python_coupling )
waLBerla_add_executable( NAME NonUniformGridCPU
FILES NonUniformGridCPU.cpp LdcSetup.h
FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include <string>
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using namespace walberla;
void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
const LDC ldc(refinementDepth);
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
if(mpi::MPIManager::instance()->numProcesses() > 1)
return;
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}
void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1)
{
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good())
{
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else
{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else
{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
\ No newline at end of file
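For orientation, the two parameter blocks that createSetupBlockForest and createBlockForest read correspond one-to-one to the dictionary returned by the benchmark's Scenario.config() callback further down. A minimal sketch of that dictionary, with illustrative values only (a sketch, not part of this merge request):

config_dict = {
    'DomainSetup': {
        'domainSize': (384, 384, 384),       # parsed as Vector3<real_t>
        'rootBlocks': (3, 3, 3),             # Vector3<uint_t>, domainSize / cellsPerBlock
        'cellsPerBlock': (128, 128, 128),    # Vector3<uint_t>
        'periodic': (0, 0, 0),               # Vector3<bool>
    },
    'SetupBlockForest': {
        'refinementDepth': 3,                  # defaults to 1 if omitted
        'numProcesses': 16,                    # overridden by the MPI size when useMPIManager=true
        'blockForestFilestem': 'blockforest',  # forest is saved to / loaded from <filestem>.bfs
        'writeVtk': False,
        'outputStatistics': True,
        'writeSetupForestAndReturn': True,     # read by the application's main(), not by GridGeneration.h
    },
}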
......@@ -48,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();
const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
for(auto & block : forest)
{
......
......@@ -38,6 +38,7 @@
#include <cmath>
#include "GridGeneration.h"
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
......@@ -77,23 +78,25 @@ int main(int argc, char** argv)
auto config = *cfg;
logging::configureLogging(config);
auto domainSetup = config->getOneBlock("DomainSetup");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
auto domainSetup = config->getOneBlock("DomainSetup");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
shared_ptr< BlockForest > bfs;
createBlockForest(bfs, domainSetup, blockForestSetup);
if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
{
WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and written to file. Returning program")
return EXIT_SUCCESS;
}
WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
auto blocks =
std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
blocks->createCellBoundingBoxes();
......@@ -173,6 +176,8 @@ int main(int argc, char** argv)
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);
auto finalDomain = blocks->getDomain();
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
......@@ -180,6 +185,12 @@ int main(int argc, char** argv)
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
vtkOutput->addCellDataWriter(velWriter);
if (parameters.getParameter< bool >("writeOnlySlice", true)){
const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
}
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
sweepCollection.calculateMacroscopicParameters(&block);
......@@ -236,6 +247,8 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess",
......
......@@ -23,17 +23,23 @@ const bool infoCsePdfs = {cse_pdfs};
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
cpu_vec = {"instruction_set": None}
streaming_pattern = 'aa'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT
fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")
......@@ -50,12 +56,12 @@ with CodeGeneration() as ctx:
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.CPU)
target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)
infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file NonUniformGridGenerator.cpp
//! \author Frederik Hennig <frederik.hennig@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/all.h"
#include "python_coupling/CreateConfig.h"
#include <string>
#include "LdcSetup.h"
using namespace walberla;
int main(int argc, char ** argv){
const mpi::Environment env(argc, argv);
mpi::MPIManager::instance()->useWorldComm();
if(mpi::MPIManager::instance()->numProcesses() > 1){
WALBERLA_ABORT("Commandment: Thou shalt not run thy grid generator with more than one process.");
}
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
auto config = *cfg;
auto domainSetup = config->getOneBlock("DomainSetup");
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
const LDC ldc(refinementDepth);
SetupBlockForest setupBfs;
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const uint_t avgBlocksPerProc = setupBfs.getNumberOfBlocks() / setupBfs.getNumberOfProcesses();
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
WALBERLA_LOG_INFO_ON_ROOT("Ending program")
}
}
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sqlite3
import os
import sys
try:
import machinestate as ms
except ImportError:
ms = None
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
class Scenario:
......@@ -18,7 +33,8 @@ class Scenario:
vtk_write_frequency=0,
logger_frequency=0,
blockforest_filestem="blockforest",
write_setup_vtk=False):
write_setup_vtk=True,
db_file_name=None):
self.domain_size = domain_size
self.root_blocks = root_blocks
......@@ -34,6 +50,8 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency
self.logger_frequency = logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
......@@ -51,7 +69,8 @@ class Scenario:
'numProcesses': self.num_processes,
'blockForestFilestem': self.bfs_filestem,
'writeVtk': self.write_setup_vtk,
'outputStatistics': False
'outputStatistics': True,
'writeSetupForestAndReturn': True,
},
'Parameters': {
'omega': 1.95,
......@@ -59,14 +78,15 @@ class Scenario:
'remainingTimeLoggerFrequency': self.logger_frequency,
'vtkWriteFrequency': self.vtk_write_frequency,
'useVTKAMRWriter': True,
'oneFilePerProcess': False
'oneFilePerProcess': False,
'writeOnlySlice': False
},
'Logging': {
'logLevel': "info",
}
}
if (print_dict):
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
......@@ -82,6 +102,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
......@@ -92,52 +121,109 @@ class Scenario:
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
def weak_scaling_ldc(num_proc, uniform=False):
wlb.log_info_on_root("Running weak scaling benchmark...")
domain_size = (96, 96, 96)
cells_per_block = (32, 32, 32)
# This benchmark must run from 16 processes onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if uniform:
factor = 3 * num_proc
name = "uniform"
else:
if num_proc % 16 != 0:
raise RuntimeError("Number of processes must be divisible by 16")
factor = int(num_proc // 16)
name = "nonuniform"
cells_per_block = (WeakX, WeakY, WeakZ)
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(domain_size=domain_size,
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=1,
refinement_depth=1,
num_processes=num_proc,
cells_per_block=cells_per_block,
timesteps=201,
vtk_write_frequency=100,
logger_frequency=5,
write_setup_vtk=True)
refinement_depth=0 if uniform else 3,
timesteps=10,
db_file_name=f"weakScalingCPU{name}LDC.sqlite3")
scenarios.add(scenario)
def strong_scaling_ldc(num_proc, uniform=False):
wlb.log_info_on_root("Running strong scaling benchmark...")
# This benchmark must run from 64 processes onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if num_proc % 64 != 0:
raise RuntimeError("Number of processes must be divisible by 64")
cells_per_block = (StrongX, StrongY, StrongZ)
if uniform:
domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
name = "uniform"
else:
factor = int(num_proc / 64)
blocks64 = block_decomposition(factor)
cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
name = "nonuniform"
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
db_file_name=f"strongScalingCPU{name}LDC.sqlite3")
scenarios.add(scenario)
def scaling():
wlb.log_info_on_root("Running scaling benchmark...")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
numProc = wlb.mpi.numProcesses()
domain_size = (96, 96, 32)
cells_per_block = (32, 32, 32)
domain_size = (256, 256, 128 * numProc)
cells_per_block = (64, 64, 64)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(domain_size=domain_size,
root_blocks=root_blocks,
num_processes=1,
refinement_depth=3,
cells_per_block=cells_per_block,
refinement_depth=2,
timesteps=10)
timesteps=1001,
vtk_write_frequency=100,
logger_frequency=5,
write_setup_vtk=True)
scenarios.add(scenario)
validation_run()
# scaling()
if BENCHMARK == 0:
validation_run()
elif BENCHMARK == 1:
weak_scaling_ldc(1, False)
elif BENCHMARK == 2:
strong_scaling_ldc(1, False)
else:
print(f"Invalid benchmark case {BENCHMARK}")
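To make the sizing logic concrete, here is the nonuniform weak-scaling arithmetic for one hypothetical process count (num_proc = 32 with the default WeakX/WeakY/WeakZ of 128; the numbers are an illustration, not results from the benchmark runs):

num_proc = 32                      # hypothetical; must be divisible by 16 in the nonuniform case
cells_per_block = (128, 128, 128)  # WeakX, WeakY, WeakZ defaults
factor = num_proc // 16            # -> 2
domain_size = (128 * 3, 128 * 3, 128 * factor)                             # -> (384, 384, 256)
root_blocks = tuple(d // c for d, c in zip(domain_size, cells_per_block))  # -> (3, 3, 2)
# 3 * 3 * 2 = 18 blocks on the coarsest level; refinement_depth = 3 then adds finer
# blocks in the two lid corners selected by LDCRefinement.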
......@@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
NonUniformGridGPUBoundaryCollection.h
NonUniformGridGPUInfoHeader.h)
waLBerla_add_executable( NAME NonUniformGridGPU
FILES NonUniformGridGPU.cpp LdcSetup.h
FILES NonUniformGridGPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include <string>
#include "LdcSetup.h"
#include "NonUniformGridGPUInfoHeader.h"
using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using namespace walberla;
void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
const LDC ldc(refinementDepth);
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
if(mpi::MPIManager::instance()->numProcesses() > 1)
return;
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++){
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}
void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1){
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good()){
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
\ No newline at end of file
......@@ -31,7 +31,9 @@
#include "field/FlagUID.h"
using namespace walberla;
using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
using FlagField_T = FlagField< uint8_t >;
class LDCRefinement
......@@ -46,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();
const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
for(auto & block : forest)
{
......@@ -99,8 +95,7 @@ class LDC
Cell globalCell(localCell);
sbfs.transformBlockLocalToGlobalCell(globalCell, b);
if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
else if (globalCell.y() < 0 || globalCell.x() < 0 || globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)))
{
flagField->addFlag(localCell, noslipFlag);
}
......
This diff is collapsed.
......@@ -7,7 +7,7 @@ from pystencils.typing import TypedSymbol
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, SubgridScaleModel
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
......@@ -30,20 +30,25 @@ const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
streaming_pattern = 'pull'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT
fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
......@@ -66,7 +71,7 @@ with CodeGeneration() as ctx:
infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
......
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sqlite3
import os
import sys
try:
import machinestate as ms
except ImportError:
ms = None
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
class Scenario:
def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2),
cells_per_block=(32, 32, 32), refinement_depth=0):
def __init__(self,
domain_size=(64, 64, 64),
root_blocks=(2, 2, 2),
num_processes=1,
refinement_depth=0,
cells_per_block=(32, 32, 32),
timesteps=101,
gpu_enabled_mpi=False,
vtk_write_frequency=0,
logger_frequency=30,
blockforest_filestem="blockforest",
write_setup_vtk=True,
db_file_name=None):
self.domain_size = domain_size
self.root_blocks = root_blocks
self.cells_per_block = cells_per_block
self.periodic = (0, 0, 1)
self.refinement_depth = refinement_depth
self.num_processes = num_processes
self.bfs_filestem = blockforest_filestem
self.write_setup_vtk = write_setup_vtk
self.timesteps = timesteps
self.gpu_enabled_mpi = gpu_enabled_mpi
self.vtk_write_frequency = vtk_write_frequency
self.logger_frequency = logger_frequency
self.periodic = (0, 0, 0)
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
......@@ -22,39 +64,79 @@ class Scenario:
'domainSize': self.domain_size,
'rootBlocks': self.root_blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic
'periodic': self.periodic,
},
'SetupBlockForest': {
'refinementDepth': self.refinement_depth,
'numProcesses': self.num_processes,
'blockForestFilestem': self.bfs_filestem,
'writeVtk': self.write_setup_vtk,
'outputStatistics': True,
'writeSetupForestAndReturn': True,
},
'Parameters': {
'omega': 1.95,
'timesteps': 30001,
'refinementDepth': self.refinement_depth,
'writeSetupForestAndReturn': False,
'numProcesses': 1,
'cudaEnabledMPI': False,
'benchmarkKernelOnly': False,
'remainingTimeLoggerFrequency': 3,
'vtkWriteFrequency': 10000,
'timesteps': self.timesteps,
'remainingTimeLoggerFrequency': self.logger_frequency,
'vtkWriteFrequency': self.vtk_write_frequency,
'useVTKAMRWriter': True,
'oneFilePerProcess': False,
'writeOnlySlice': False,
'gpuEnabledMPI': self.gpu_enabled_mpi,
'gpuBlockSize': (128, 1, 1),
},
'Logging': {
'logLevel': "info",
}
}
if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False:
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# check multiple times e.g. may fail when multiple benchmark processes are running
table_name = f"runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
domain_size = (96, 96, 96)
cells_per_block = (32, 32, 32)
domain_size = (192, 192, 64)
cells_per_block = (64, 64, 64)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
......@@ -62,8 +144,91 @@ def validation_run():
scenario = Scenario(domain_size=domain_size,
root_blocks=root_blocks,
cells_per_block=cells_per_block,
refinement_depth=1)
timesteps=0,
vtk_write_frequency=0,
refinement_depth=3,
gpu_enabled_mpi=False)
scenarios.add(scenario)
validation_run()
def weak_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running weak scaling benchmark...")
# This benchmark must run from 16 GPUs onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if uniform:
factor = 3 * num_proc
name = "uniform"
else:
if num_proc % 16 != 0:
raise RuntimeError("Number of processes must be divisible by 16")
factor = int(num_proc // 16)
name = "nonuniform"
cells_per_block = (WeakX, WeakY, WeakZ)
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"weakScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
def strong_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running strong scaling benchmark...")
# This benchmark must run from 64 GPUs onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if num_proc % 64 != 0:
raise RuntimeError("Number of processes must be divisible by 64")
cells_per_block = (StrongX, StrongY, StrongZ)
if uniform:
domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
name = "uniform"
else:
factor = int(num_proc / 64)
blocks64 = block_decomposition(factor)
cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
name = "nonuniform"
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"strongScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
if BENCHMARK == 0:
validation_run()
elif BENCHMARK == 1:
weak_scaling_ldc(1, True, False)
elif BENCHMARK == 2:
strong_scaling_ldc(1, True, False)
else:
print(f"Invalid benchmark case {BENCHMARK}")
......@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
else ()
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
......@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
import sys
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
"""Checks if a single block of given size fits into GPU memory"""
......@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s
class Scenario:
def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
def __init__(self, time_step_strategy,
cuda_block_size,
cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = 0
......@@ -89,6 +96,14 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
......@@ -101,43 +116,19 @@ class Scenario:
def benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_size = (256, 256, 256)
block_size = (320, 320, 320)
cuda_enabled_mpi = True
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
else:
scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
scenarios.add(Scenario(time_step_strategy='normal',
cuda_block_size=(128, 1, 1),
cells_per_block=block_size,
cuda_enabled_mpi=cuda_enabled_mpi))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2),
(32, 8, 1),
(16, 16, 1)]
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for cuda_block in cuda_blocks:
for block_size in block_sizes:
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
continue
scenario = Scenario(time_step_strategy=time_step_strategy,
cuda_block_size=cuda_block,
cells_per_block=block_size)
scenarios.add(scenario)
# benchmark()
kernel_benchmark()
benchmark()
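The exact accounting lives in the collapsed body of domain_block_size_ok, but a rough back-of-the-envelope using only its default arguments suggests why the default GPU memory was raised from 8 GB to 40 GB together with the larger block size (an estimate under stated assumptions, not the function's actual formula):

q_phase, q_hydro, size_per_value, gls = 15, 27, 8, 1
values_per_cell = q_phase + q_hydro + 3 + 1        # PDFs + velocity(3) + phase, ignoring temporaries
cells = (320 + 2 * gls) ** 3                       # one ghost layer around a 320^3 block
print(cells * values_per_cell * size_per_value / 2 ** 30)   # roughly 11 GiB, well below 40 GB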
......@@ -29,6 +29,7 @@
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
......@@ -78,14 +79,10 @@ int main(int argc, char** argv)
logging::configureLogging(config);
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
......@@ -102,6 +99,7 @@ int main(int argc, char** argv)
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
BlockDataID phase_field_gpu =
gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
#else
BlockDataID lb_phase_field =
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
......@@ -109,6 +107,7 @@ int main(int argc, char** argv)
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
#endif
if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
......@@ -139,47 +138,80 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif
// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
auto Comm_velocity_based_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
const int streamLowPriority = 0;
const int streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
auto Comm_phase_field_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#else
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< PhaseField_T >(phase_field);
auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#endif
BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
......@@ -201,99 +233,37 @@ int main(int argc, char** argv)
init_h(&block);
init_g(&block);
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
}
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
int const streamLowPriority = 0;
int const streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
#endif
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions->startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block, defaultStream);
Comm_velocity_based_distributions->wait();
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
Comm_phase_field_distributions->startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block, defaultStream);
Comm_phase_field_distributions->wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#else
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block);
Comm_velocity_based_distributions.wait();
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block);
Comm_phase_field_distributions.wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
}
else if (timeStepStrategy == "hydro_only")
{
timeStep = std::function< void() >(hydro_only);
WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
}
else if (timeStepStrategy == "kernel_only")
{
timeStep = std::function< void() >(without_comm);
WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
}
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
}
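For orientation, the strategy strings checked here come from the parameter input; a minimal hedged sketch of how a Python simulation_setup script might provide them (only the key names appear in the code above; the surrounding block name 'Parameters' and all values are assumptions):

# hypothetical parameter block for the benchmark driver (illustration only)
parameters = {
    'Parameters': {
        'timesteps': 500,                    # assumed value
        'timeStepStrategy': 'normal',        # or 'phase_only', 'hydro_only', 'kernel_only'
        'warmupSteps': 10,                   # assumed value
        'vtkWriteFrequency': 0,              # assumed value
        'remainingTimeLoggerFrequency': 20,  # assumed value
    }
}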
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
// remaining time logger
if (remainingTimeLoggerFrequency > 0)
timeLoop->addFuncAfterTimeStep(
timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
"remaining time logger");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
#endif
uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 1)
@@ -307,40 +277,60 @@ int main(int argc, char** argv)
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter);
timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop->singleStep();
timeloop.singleStep();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
timeLoop->setCurrentTimeStepToZero();
timeloop.setCurrentTimeStepToZero();
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WcTimingPool timeloopTiming;
WcTimer simTimer;
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
#endif
simTimer.start();
timeLoop->run();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
auto time = real_c(simTimer.last());
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
#endif
// Call Python function to report results
pythonCallbackResults();
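The exposed values are consumed on the Python side by a function named results_callback in the simulation setup script; a minimal hedged sketch of such a callback (assuming, as in waLBerla's benchmark scenario scripts, that the exposed values arrive as keyword arguments):

def results_callback(**kwargs):
    # kwargs is expected to carry the values exposed above,
    # e.g. 'mlupsPerProcess', 'stencil_phase', 'stencil_hydro'
    print("MLUPS per process:", kwargs.get("mlupsPerProcess"))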
@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
stencil_phase = LBStencil(Stencil.D3Q15)
stencil_hydro = LBStencil(Stencil.D3Q27)
stencil_hydro = LBStencil(Stencil.D3Q19)
assert (stencil_phase.D == stencil_hydro.D)
########################
@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
delta_equilibrium=False,
force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
weighted=True, relaxation_rates=rates,
output={'density': C_tmp}, kernel_type='stream_pull_collide')
output={'density': C_tmp})
method_phase = create_lb_method(lbm_config=lbm_config_phase)
lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
weighted=True, relaxation_rate=omega,
force=sp.symbols(f"F_:{stencil_hydro.D}"),
output={'velocity': u}, kernel_type='collide_stream_push')
output={'velocity': u})
method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
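The two create_lb_method calls above yield lbmpy method objects that can also be inspected outside the code generation context; a hedged sketch (the attribute names are assumed from lbmpy's public API, and the config is a simplified stand-in for the hydro setup above):

import sympy as sp
from lbmpy import LBMConfig, LBStencil, Stencil, Method, create_lb_method

# simplified stand-in for lbm_config_hydro: D3Q19 MRT with a single symbolic rate
cfg = LBMConfig(stencil=LBStencil(Stencil.D3Q19), method=Method.MRT,
                compressible=False, relaxation_rate=sp.Symbol("omega"))
method = create_lb_method(lbm_config=cfg)
print(method.stencil)           # discrete velocity set
print(method.relaxation_rates)  # per-moment relaxation rates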
# create the kernels for the initialization of the g and h field
@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
sweep_params = {'block_size': sweep_block_size}
stencil_typedefs = {'Stencil_phase_T': stencil_phase,
'Stencil_hydro_T': stencil_hydro}
'Stencil_hydro_T': stencil_hydro,
'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
field_typedefs = {'PdfField_phase_T': h,
'PdfField_hydro_T': g,
'VelocityField_T': u,
@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec,
target=Target.CPU)
@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.CPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.CPU)
streaming_pattern='pull', target=Target.CPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
g_updates, target=Target.GPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
target=Target.GPU,
gpu_indexing_params=sweep_params,
varying_parameters=vp)
@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.GPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.GPU)
streaming_pattern='pull', target=Target.GPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
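For reference, the nested foreach loops above expand into one generated target per (stencil, streaming pattern, collision setup) combination, minus the excluded pairs; a hedged Python sketch of the resulting config names (illustration only, not part of the build):

from itertools import product

stencils = ["d3q19", "d3q27"]
patterns = ["pull", "push", "aa", "esotwist", "esopull", "esopush"]
setups = ["srt", "trt", "mrt", "mrt-overrelax", "central", "central-overrelax",
          "cumulant", "cumulant-overrelax", "cumulant-K17", "entropic", "smagorinsky", "qr"]

configs = [f"{st}_{pat}_{col}"
           for st, pat, col in product(stencils, patterns, setups)
           if not (st == "d3q19" and col in ("entropic", "cumulant-K17"))]
# each entry maps to a target, e.g. 'd3q19_pull_srt' -> UniformGridCPUGenerated_d3q19_pull_srt
print(len(configs), configs[0])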
@@ -64,6 +64,9 @@ using SweepCollection_T = lbm::UniformGridCPUSweepCollection;
using blockforest::communication::UniformBufferedScheme;
using macroFieldType = VelocityField_T::value_type;
using pdfFieldType = PdfField_T::value_type;
int main(int argc, char** argv)
{
const mpi::Environment env(argc, argv);
@@ -87,10 +90,10 @@ int main(int argc, char** argv)
// Creating fields
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto fieldAllocator = make_shared< field::AllocateAligned< real_t, 64 > >();
auto fieldAllocator = make_shared< field::AllocateAligned< pdfFieldType, 64 > >();
const BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Initialize velocity on cpu
@@ -235,12 +238,16 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results