2dee476c · 2dee476c · 2dee476c · 2dee476c · 2dee476c · 2dee476c
--- a/apps/benchmarks/MotionSingleHeavySphere/CMakeLists.txt
+++ b/apps/benchmarks/MotionSingleHeavySphere/CMakeLists.txt
-waLBerla_add_executable ( NAME MotionSingleHeavySphere 
-                          DEPENDS blockforest boundary core domain_decomposition field lbm pe pe_coupling postprocessing timeloop vtk )
\ No newline at end of file
+waLBerla_add_executable ( NAME MotionSingleHeavySphere
+      DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::lbm walberla::pe walberla::pe_coupling walberla::postprocessing walberla::timeloop walberla::vtk )
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGrid/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGrid/CMakeLists.txt
@@ -2,5 +2,5 @@
 waLBerla_link_files_to_builddir( "*.dat" )


-waLBerla_add_executable ( NAME NonUniformGridBenchmark 
-                          DEPENDS blockforest boundary core domain_decomposition field lbm postprocessing timeloop vtk sqlite)
\ No newline at end of file
+waLBerla_add_executable ( NAME NonUniformGridBenchmark
+      DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::lbm walberla::postprocessing walberla::timeloop walberla::vtk walberla::sqlite )
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp
+++ b/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp
@@ -243,23 +243,23 @@ static void refinementSelection( SetupBlockForest& forest )
   AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMax() - zSize,
                     domain.xMax(), domain.yMax(), domain.zMax() );

-   for( auto block = forest.begin(); block != forest.end(); ++block )
+   for(auto & block : forest)
   {
-      auto & aabb = block->getAABB();
+      auto & aabb = block.getAABB();
      if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
      {
-         if( block->getLevel() < ( BlockForestLevels - uint_t(1) ) )
-            block->setMarker( true );
+         if( block.getLevel() < ( BlockForestLevels - uint_t(1) ) )
+            block.setMarker( true );
      }
   }
 }

 static void workloadAndMemoryAssignment( SetupBlockForest & forest, const memory_t memoryPerBlock ) {

-   for( auto block = forest.begin(); block != forest.end(); ++block )
+   for(auto & block : forest)
   {
-      block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-      block->setMemory( memoryPerBlock );
+      block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+      block.setMemory( memoryPerBlock );
   }
 }

@@ -295,7 +295,7 @@ void createSetupBlockForest( blockforest::SetupBlockForest & sforest, const Conf

 #ifndef NDEBUG
   WALBERLA_ASSERT_EQUAL( sforest.getNumberOfBlocks(), numberOfBlocks( workerProcesses, fineBlocksPerProcess ) );
-   for( uint_t i = uint_t(0); i != BlockForestLevels; ++i )
+   for( auto i = uint_t(0); i != BlockForestLevels; ++i )
   {
      std::vector< blockforest::SetupBlock * > blocks;
      sforest.getBlocks( blocks, i );
@@ -415,13 +415,13 @@ void ReGrid::operator()( std::vector< std::pair< const Block *, uint_t > > & min
                  domain.xMax(), domain.yMax(), domain.zMin() + real_c(0.99) * zSize );
   }
   
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
   {
-      auto & aabb = it->first->getAABB();
+      auto & aabb = minTargetLevel.first->getAABB();
      if( left.intersects( aabb ) || right.intersects( aabb ) )
-         it->second = BlockForestLevels - uint_t(1);
+         minTargetLevel.second = BlockForestLevels - uint_t(1);
      else
-         it->second = uint_t(0);
+         minTargetLevel.second = uint_t(0);
   }
 }


--- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.py" )
+waLBerla_link_files_to_builddir( "simulation_setup" )
+
+waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
+        FILE NonUniformGridCPU.py
+        OUT_FILES NonUniformGridCPUStorageSpecification.h NonUniformGridCPUStorageSpecification.cpp
+        NonUniformGridCPUSweepCollection.h NonUniformGridCPUSweepCollection.cpp
+        NoSlip.h NoSlip.cpp
+        UBB.h UBB.cpp
+        NonUniformGridCPUBoundaryCollection.h
+        NonUniformGridCPUInfoHeader.h)
+
+waLBerla_add_executable( NAME NonUniformGridCPU
+                         FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
+      DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::lbm_generated walberla::python_coupling walberla::timeloop walberla::vtk NonUniformGridCPUGenerated )
--- a/apps/benchmarks/NonUniformGridCPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridCPU/GridGeneration.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GridGeneration.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include <string>
+
+#include "LdcSetup.h"
+#include "NonUniformGridCPUInfoHeader.h"
+
+using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+
+using namespace walberla;
+
+void createSetupBlockForest(SetupBlockForest& setupBfs,
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager=false)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+   const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
+   const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
+   const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
+
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
+   const LDC ldc(refinementDepth);
+
+   auto refSelection = ldc.refinementSelector();
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   {
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+   }
+
+   if(writeVtk){
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+
+   if(outputStatistics){
+      WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++)
+      {
+         const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+      }
+
+      const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+      WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+      const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
+      const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+      const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+      const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+      const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
+      const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+      const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
+
+      WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+      WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+      WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
+
+      WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1)
+   {
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good())
+      {
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else
+      {
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else
+   {
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/all.h"
+
+#include "field/FlagField.h"
+
+#include "field/FlagUID.h"
+
+using namespace walberla;
+
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+
+using FlagField_T          = FlagField< uint8_t >;
+
+class LDCRefinement
+{
+ private:
+   const uint_t refinementDepth_;
+
+ public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest) const
+   {
+      const AABB & domain = forest.getDomain();
+
+      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
+      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
+
+      for(auto & block : forest)
+      {
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         {
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
+         }
+      }
+   }
+};
+
+class LDC
+{
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+ public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
+
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         auto& b           = dynamic_cast< Block& >(*bIt);
+         const uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
+         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            const Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
+                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
+            {
+               flagField->addFlag(localCell, noslipFlag);
+            }
+         }
+      }
+   }
+};
--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file NonUniformGridCPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
+#include "python_coupling/PythonCallback.h"
+
+#include "timeloop/SweepTimeloop.h"
+
+#include <cmath>
+
+#include "GridGeneration.h"
+#include "LdcSetup.h"
+#include "NonUniformGridCPUInfoHeader.h"
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
+
+using namespace walberla;
+
+using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+
+using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
+using BoundaryCollection_T = lbm::NonUniformGridCPUBoundaryCollection< FlagField_T >;
+
+using SweepCollection_T = lbm::NonUniformGridCPUSweepCollection;
+
+using blockforest::communication::NonUniformBufferedScheme;
+
+int main(int argc, char** argv)
+{
+   const mpi::Environment env(argc, argv);
+   mpi::MPIManager::instance()->useWorldComm();
+
+   const std::string input_filename(argv[1]);
+   const bool inputIsPython = string_ends_with(input_filename, ".py");
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      WALBERLA_MPI_BARRIER()
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                            BLOCK FOREST SETUP                                              ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+      auto domainSetup      = config->getOneBlock("DomainSetup");
+      auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+      const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
+
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+
+      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+
+      shared_ptr< BlockForest > bfs;
+      createBlockForest(bfs, domainSetup, blockForestSetup);
+
+      if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+         return EXIT_SUCCESS;
+      }
+
+      auto blocks =
+         std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
+      blocks->createCellBoundingBoxes();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
+      }
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                    DISTRIBUTED DATA STRUCTURES                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto ldc                     = std::make_shared< LDC >(refinementDepth);
+
+      // Reading parameters
+      auto parameters                = config->getOneBlock("Parameters");
+      const real_t omega             = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const uint_t timesteps         = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false);
+
+      // Creating fields
+      const StorageSpecification_T StorageSpec = StorageSpecification_T();
+      const BlockDataID pdfFieldID =
+         lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx);
+      const BlockDataID velFieldID =
+         field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2));
+      const BlockDataID densityFieldID =
+         field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2));
+      const BlockDataID flagFieldID =
+         field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
+
+      const Cell innerOuterSplit =
+         Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
+      SweepCollection_T sweepCollection(blocks, pdfFieldID, densityFieldID, velFieldID, omega, innerOuterSplit);
+      for (auto& block : *blocks)
+      {
+         sweepCollection.initialise(&block, cell_idx_c(1));
+      }
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      const FlagUID fluidFlagUID("Fluid");
+      ldc->setupBoundaryFlagField(*blocks, flagFieldID);
+      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 2);
+      BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldID, fluidFlagUID);
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                           COMMUNICATION SCHEME                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+      auto communication = std::make_shared< NonUniformBufferedScheme< CommunicationStencil_T > >(blocks);
+      auto packInfo      = lbm_generated::setupNonuniformPdfCommunication< PdfField_T >(blocks, pdfFieldID);
+      communication->addPackInfo(packInfo);
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                          TIME STEP DEFINITIONS                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      lbm_generated::BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T > LBMMeshRefinement(
+         blocks, pdfFieldID, sweepCollection, boundaryCollection, communication, packInfo);
+
+      SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
+
+      if (benchmarkKernelOnly)
+      {
+         timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
+      }
+      else { LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); }
+
+      // VTK
+      const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
+      const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);
+
+      auto finalDomain = blocks->getDomain();
+      if (vtkWriteFrequency > 0)
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                         "simulation_step", false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess);
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
+         vtkOutput->addCellDataWriter(velWriter);
+
+         if (parameters.getParameter< bool >("writeOnlySlice", true)){
+            const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
+                               finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+         }
+
+         vtkOutput->addBeforeFunction([&]() {
+            for (auto& block : *blocks)
+               sweepCollection.calculateMacroscopicParameters(&block);
+         });
+         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+      }
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                               BENCHMARK                                                    ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto remainingTimeLoggerFrequency =
+         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
+      if (remainingTimeLoggerFrequency > 0)
+      {
+         auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency);
+         timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
+      }
+
+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
+      fluidCells();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Non uniform Grid benchmark with " << fluidCells.numberOfCells()
+                                                                   << " fluid cells (in total on all levels)")
+
+      WcTimingPool timeloopTiming;
+      WcTimer simTimer;
+
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Starting benchmark with " << timesteps << " time steps")
+      WALBERLA_MPI_BARRIER()
+
+      simTimer.start();
+      timeLoop.run(timeloopTiming);
+      WALBERLA_MPI_BARRIER()
+      simTimer.end();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Benchmark finished")
+      double time = simTimer.max();
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+      WALBERLA_ROOT_SECTION()
+      {
+         if (inputIsPython)
+         {
+            python_coupling::PythonCallback pythonCallbackResults("results_callback");
+            if (pythonCallbackResults.isCallable())
+            {
+               pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
+               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+               pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
+               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerProcess",
+                                                        performance.mlupsPerProcess(timesteps, time));
+               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+               // Call Python function to report results
+               pythonCallbackResults();
+            }
+         }
+      }
+   }
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
+import sympy as sp
+
+import pystencils as ps
+
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
+
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+omega = sp.symbols("omega")
+omega_free = sp.Symbol("omega_free")
+
+info_header = """
+const char * infoStencil = "{stencil}";
+const char * infoStreamingPattern = "{streaming_pattern}";
+const char * infoCollisionSetup = "{collision_setup}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
+    cpu_vec = {"instruction_set": None}
+
+    streaming_pattern = 'esopull'
+    timesteps = get_timesteps(streaming_pattern)
+    stencil = LBStencil(Stencil.D3Q19)
+    method_enum = Method.CUMULANT
+
+    fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
+    collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
+
+    assert stencil.D == 3, "This application supports only three-dimensional stencils"
+    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
+    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
+                           fourth_order_correction=fourth_order_correction,
+                           streaming_pattern=streaming_pattern)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip())
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([0.05, 0, 0], data_type=field_type))
+
+    generate_lbm_package(ctx, name="NonUniformGridCPU",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, ubb],
+                         macroscopic_fields=macroscopic_fields,
+                         target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)
+
+    infoHeaderParams = {
+        'stencil': stencil.name.lower(),
+        'streaming_pattern': streaming_pattern,
+        'collision_setup': collision_setup,
+        'cse_global': int(lbm_opt.cse_global),
+        'cse_pdfs': int(lbm_opt.cse_pdfs),
+    }
+
+    field_typedefs = {'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
+    generate_info_header(ctx, 'NonUniformGridCPUInfoHeader',
+                         field_typedefs=field_typedefs,
+                         additional_code=info_header.format(**infoHeaderParams))
--- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sqlite3
+import os
+import sys
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
+
+
+class Scenario:
+    def __init__(self,
+                 domain_size=(64, 64, 64),
+                 root_blocks=(2, 2, 2),
+                 num_processes=1,
+                 refinement_depth=0,
+                 cells_per_block=(32, 32, 32),
+                 timesteps=101,
+                 vtk_write_frequency=0,
+                 logger_frequency=0,
+                 blockforest_filestem="blockforest",
+                 write_setup_vtk=True,
+                 db_file_name=None):
+
+        self.domain_size = domain_size
+        self.root_blocks = root_blocks
+        self.cells_per_block = cells_per_block
+        self.periodic = (0, 0, 0)
+
+        self.refinement_depth = refinement_depth
+        self.num_processes = num_processes
+        self.bfs_filestem = blockforest_filestem
+        self.write_setup_vtk = write_setup_vtk
+
+        self.timesteps = timesteps
+        self.vtk_write_frequency = vtk_write_frequency
+        self.logger_frequency = logger_frequency
+
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+
+        self.config_dict = self.config(print_dict=False)
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'DomainSetup': {
+                'domainSize': self.domain_size,
+                'rootBlocks': self.root_blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic
+            },
+            'SetupBlockForest': {
+                'refinementDepth': self.refinement_depth,
+                'numProcesses': self.num_processes,
+                'blockForestFilestem': self.bfs_filestem,
+                'writeVtk': self.write_setup_vtk,
+                'outputStatistics': True,
+                'writeSetupForestAndReturn': True,
+            },
+            'Parameters': {
+                'omega': 1.95,
+                'timesteps': self.timesteps,
+                'remainingTimeLoggerFrequency': self.logger_frequency,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'useVTKAMRWriter': True,
+                'oneFilePerProcess': False,
+                'writeOnlySlice': False
+            },
+            'Logging': {
+                'logLevel': "info",
+            }
+        }
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+
+        return config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
+        sequenceValuesToScalars(data)
+
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs"
+        table_name = table_name.replace("-", "_")
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+
+
+def weak_scaling_ldc(num_proc, uniform=False):
+    wlb.log_info_on_root("Running weak scaling benchmark...")
+
+    # This benchmark must run from 16 processes onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if uniform:
+        factor = 3 * num_proc
+        name = "uniform"
+    else:
+        if num_proc % 16 != 0:
+            raise RuntimeError("Number of processes must be dividable by 16")
+        factor = int(num_proc // 16)
+        name = "nonuniform"
+
+    cells_per_block = (WeakX, WeakY, WeakZ)
+    domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=num_proc,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        db_file_name=f"weakScalingCPU{name}LDC.sqlite3")
+    scenarios.add(scenario)
+
+
+def strong_scaling_ldc(num_proc, uniform=False):
+    wlb.log_info_on_root("Running strong scaling benchmark...")
+
+    # This benchmark must run from 64 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if num_proc % 64 != 0:
+        raise RuntimeError("Number of processes must be dividable by 64")
+
+    cells_per_block = (StrongX, StrongY, StrongZ)
+
+    if uniform:
+        domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
+        name = "uniform"
+    else:
+        factor = int(num_proc / 64)
+        blocks64 = block_decomposition(factor)
+        cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
+        domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+        name = "nonuniform"
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=num_proc,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        db_file_name=f"strongScalingCPU{name}LDC.sqlite3")
+    scenarios.add(scenario)
+
+
+def validation_run():
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")
+
+    domain_size = (96, 96, 32)
+    cells_per_block = (32, 32, 32)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=1,
+                        refinement_depth=3,
+                        cells_per_block=cells_per_block,
+                        timesteps=1001,
+                        vtk_write_frequency=100,
+                        logger_frequency=5,
+                        write_setup_vtk=True)
+    scenarios.add(scenario)
+
+
+if BENCHMARK == 0:
+    validation_run()
+elif BENCHMARK == 1:
+    weak_scaling_ldc(1, False)
+elif BENCHMARK == 2:
+    strong_scaling_ldc(1, False)
+else:
+    print(f"Invalid benchmark case {BENCHMARK}")
--- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
+waLBerla_link_files_to_builddir( "*.prm" )
+waLBerla_link_files_to_builddir( "*.py" )
+waLBerla_link_files_to_builddir( "simulation_setup" )
+
+waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
+        FILE NonUniformGridGPU.py
+        OUT_FILES NonUniformGridGPUStorageSpecification.h NonUniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX}
+        NonUniformGridGPUSweepCollection.h NonUniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX}
+        NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
+        UBB.h UBB.${CODEGEN_FILE_SUFFIX}
+        NonUniformGridGPUBoundaryCollection.h
+        NonUniformGridGPUInfoHeader.h)
+waLBerla_add_executable( NAME NonUniformGridGPU
+                         FILES NonUniformGridGPU.cpp LdcSetup.h GridGeneration.h
+      DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::gpu walberla::domain_decomposition walberla::field walberla::geometry walberla::lbm_generated walberla::python_coupling walberla::timeloop walberla::vtk NonUniformGridGPUGenerated )
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+++ b/apps/benchmarks/NonUniformGridGPU/GridGeneration.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GridGeneration.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include <string>
+
+#include "LdcSetup.h"
+#include "NonUniformGridGPUInfoHeader.h"
+
+using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+
+using namespace walberla;
+
+void createSetupBlockForest(SetupBlockForest& setupBfs,
+                            const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
+                            const bool useMPIManager=false)
+{
+   WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
+
+   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+   Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+   const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+   uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+   const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
+   const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
+   const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
+
+   if(useMPIManager)
+      numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
+
+   const LDC ldc(refinementDepth);
+
+   auto refSelection = ldc.refinementSelector();
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1)
+      return;
+
+   {
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      setupBfs.saveToFile(oss.str().c_str());
+   }
+
+   if(writeVtk){
+      setupBfs.writeVTKOutput(blockForestFilestem);
+   }
+
+   if(outputStatistics){
+      WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+      for (uint_t level = 0; level <= refinementDepth; level++){
+         const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+      }
+
+      const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
+      WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+
+      const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
+      const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+      const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
+      const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
+      const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
+      const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
+      const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
+
+      WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
+      WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
+      WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
+
+      WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
+   }
+}
+
+void createBlockForest(shared_ptr< BlockForest >& bfs,
+                       const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
+{
+   if (mpi::MPIManager::instance()->numProcesses() > 1){
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
+      std::ifstream infile(setupBlockForestFilepath.c_str());
+      if(!infile.good()){
+         WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
+         SetupBlockForest setupBfs;
+         createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      }
+      else{
+         bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                               setupBlockForestFilepath.c_str(), false);
+      }
+   }
+   else{
+      SetupBlockForest setupBfs;
+      createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
+      bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+   }
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/all.h"
+
+#include "field/FlagField.h"
+
+#include "field/FlagUID.h"
+
+using namespace walberla;
+
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+
+using FlagField_T          = FlagField< uint8_t >;
+
+class LDCRefinement
+{
+ private:
+   const uint_t refinementDepth_;
+
+ public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest) const
+   {
+      const AABB & domain = forest.getDomain();
+
+      const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
+      const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
+
+      for(auto & block : forest)
+      {
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         {
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
+         }
+      }
+   }
+};
+
+class LDC
+{
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+ public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
+
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         auto& b           = dynamic_cast< Block& >(*bIt);
+         const uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
+         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            const Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.y() < 0 || globalCell.x() < 0 || globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)))
+            {
+               flagField->addFlag(localCell, noslipFlag);
+            }
+         }
+      }
+   }
+};
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file NonUniformGridGPU.cpp
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/vtk/VTKWriter.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/ErrorChecking.h"
+#include "gpu/FieldCopy.h"
+#include "gpu/HostFieldAllocator.h"
+#include "gpu/ParallelStreams.h"
+#include "gpu/communication/NonUniformGPUScheme.h"
+
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
+#include "python_coupling/PythonCallback.h"
+
+#include <cmath>
+
+#include "GridGeneration.h"
+#include "LdcSetup.h"
+#include "NonUniformGridGPUInfoHeader.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/gpu/AddToStorage.h"
+#include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h"
+#include "lbm_generated/gpu/GPUPdfField.h"
+#include "lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h"
+using namespace walberla;
+
+using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
+using Stencil_T              = StorageSpecification_T::Stencil;
+using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
+
+using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
+using GPUPdfField_T        = lbm_generated::GPUPdfField< StorageSpecification_T >;
+using FlagField_T          = FlagField< uint8_t >;
+using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField_T >;
+
+using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection;
+
+using gpu::communication::NonUniformGPUScheme;
+
+int main(int argc, char** argv)
+{
+   const mpi::Environment env(argc, argv);
+   mpi::MPIManager::instance()->useWorldComm();
+   gpu::selectDeviceBasedOnMpiRank();
+
+   const std::string input_filename(argv[1]);
+   const bool inputIsPython = string_ends_with(input_filename, ".py");
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      WALBERLA_MPI_WORLD_BARRIER()
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                        SETUP AND CONFIGURATION                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto config = *cfg;
+      logging::configureLogging(config);
+      auto domainSetup        = config->getOneBlock("DomainSetup");
+      auto blockForestSetup   = config->getOneBlock("SetupBlockForest");
+      const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
+
+      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
+      // Reading parameters
+      auto parameters   = config->getOneBlock("Parameters");
+      const real_t omega             = parameters.getParameter< real_t >("omega", real_c(1.95));
+      const uint_t timesteps         = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const bool gpuEnabledMPI       = parameters.getParameter< bool >("gpuEnabledMPI", false);
+
+      shared_ptr< BlockForest > bfs;
+      createBlockForest(bfs, domainSetup, blockForestSetup);
+
+      if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and writen to file. Returning program")
+         return EXIT_SUCCESS;
+      }
+
+      auto blocks =
+         std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
+      blocks->createCellBoundingBoxes();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks() << " on " << blocks->getNumberOfLevels() << " refinement levels")
+      for (uint_t level = 0; level < blocks->getNumberOfLevels(); level++)
+      {
+         WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
+      }
+
+      WALBERLA_LOG_INFO_ON_ROOT("Start field allocation")
+      // Creating fields
+      const StorageSpecification_T StorageSpec = StorageSpecification_T();
+      auto allocator                           = make_shared< gpu::HostFieldAllocator< real_t > >();
+      const BlockDataID pdfFieldCpuID =
+         lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(2), field::fzyx, allocator);
+      const BlockDataID velFieldCpuID =
+         field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx, uint_c(2), allocator);
+      const BlockDataID densityFieldCpuID =
+         field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx, uint_c(2), allocator);
+      const BlockDataID flagFieldID =
+         field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field", uint_c(3));
+
+      const BlockDataID pdfFieldGpuID =
+         lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true);
+      const BlockDataID velFieldGpuID =
+         gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
+      const BlockDataID densityFieldGpuID =
+         gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "velocity on GPU", true);
+      WALBERLA_LOG_INFO_ON_ROOT("Finished field allocation")
+
+      const Cell innerOuterSplit =
+         Cell(parameters.getParameter< Vector3< cell_idx_t > >("innerOuterSplit", Vector3< cell_idx_t >(1, 1, 1)));
+      Vector3< int32_t > gpuBlockSize =
+         parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
+      SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0],
+                                        gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit);
+
+      for (auto& iBlock : *blocks){
+         sweepCollection.initialise(&iBlock, cell_idx_c(1));
+      }
+      sweepCollection.initialiseBlockPointer();
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto ldc = std::make_shared< LDC >(blocks->getDepth());
+
+      const FlagUID fluidFlagUID("Fluid");
+      ldc->setupBoundaryFlagField(*blocks, flagFieldID);
+      geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID, 0);
+      BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID);
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                           COMMUNICATION SCHEME                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      WALBERLA_LOG_INFO_ON_ROOT("Setting up communication...")
+      auto communication = std::make_shared< NonUniformGPUScheme< CommunicationStencil_T > >(blocks, gpuEnabledMPI);
+      auto packInfo      = lbm_generated::setupNonuniformGPUPdfCommunication< GPUPdfField_T >(blocks, pdfFieldGpuID);
+      communication->addPackInfo(packInfo);
+      WALBERLA_MPI_BARRIER()
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                          TIME STEP DEFINITIONS                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      int streamHighPriority = 0;
+      int streamLowPriority  = 0;
+      WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
+      sweepCollection.setOuterPriority(streamHighPriority);
+      auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
+
+      lbm_generated::BasicRecursiveTimeStepGPU< GPUPdfField_T, SweepCollection_T, BoundaryCollection_T >
+         LBMMeshRefinement(blocks, pdfFieldGpuID, sweepCollection, boundaryCollection, communication, packInfo);
+      SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
+
+      LBMMeshRefinement.addRefinementToTimeLoop(timeLoop, uint_c(0));
+
+      // VTK
+      const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      const bool useVTKAMRWriter     = parameters.getParameter< bool >("useVTKAMRWriter", false);
+      const bool oneFilePerProcess   = parameters.getParameter< bool >("oneFilePerProcess", false);
+
+      auto finalDomain = blocks->getDomain();
+      if (vtkWriteFrequency > 0){
+         auto vtkOutput =
+            vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", "simulation_step",
+                                           false, true, true, false, 0, useVTKAMRWriter, oneFilePerProcess);
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel");
+         vtkOutput->addCellDataWriter(velWriter);
+
+         if (parameters.getParameter< bool >("writeOnlySlice", true)){
+            const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(blocks->getDepth()),
+                               finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(blocks->getDepth()));
+            vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
+         }
+
+         vtkOutput->addBeforeFunction([&]() {
+            for (auto& block : *blocks)
+               sweepCollection.calculateMacroscopicParameters(&block);
+            gpu::fieldCpy< VelocityField_T, gpu::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
+         });
+         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
+      }
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                               BENCHMARK                                                    ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto remainingTimeLoggerFrequency =
+         parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
+      if (remainingTimeLoggerFrequency > 0){
+         auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency);
+         timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
+      }
+
+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
+      fluidCells();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Non uniform Grid benchmark with " << fluidCells.numberOfCells()
+                                                                   << " fluid cells (in total on all levels)")
+
+      WcTimingPool timeloopTiming;
+      WcTimer simTimer;
+
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WALBERLA_MPI_BARRIER()
+
+      simTimer.start();
+      timeLoop.run(timeloopTiming);
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_MPI_BARRIER()
+      simTimer.end();
+
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+      double time = simTimer.max();
+      WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
+      performance.logResultOnRoot(timesteps, time);
+
+      const auto reducedTimeloopTiming = timeloopTiming.getReduced();
+      WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+      WALBERLA_ROOT_SECTION()
+      {
+         if (inputIsPython)
+         {
+            python_coupling::PythonCallback pythonCallbackResults("results_callback");
+            if (pythonCallbackResults.isCallable())
+            {
+               pythonCallbackResults.data().exposeValue("numProcesses", lbm_generated::PerformanceEvaluation< FlagField_T >::processes());
+               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
+               pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
+               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
+               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+               // Call Python function to report results
+               pythonCallbackResults();
+            }
+         }
+      }
+   }
+   return EXIT_SUCCESS;
+}
\ No newline at end of file
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.py
+import sympy as sp
+import numpy as np
+
+import pystencils as ps
+from pystencils.typing import TypedSymbol
+
+from lbmpy.advanced_streaming.utility import get_timesteps
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
+from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, SubgridScaleModel
+
+from pystencils_walberla import CodeGeneration, generate_info_header
+from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
+
+omega = sp.symbols("omega")
+omega_free = sp.Symbol("omega_free")
+compile_time_block_size = False
+max_threads = 256
+
+sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                    TypedSymbol("cudaBlockSize1", np.int32),
+                    TypedSymbol("cudaBlockSize2", np.int32))
+
+gpu_indexing_params = {'block_size': sweep_block_size}
+
+info_header = """
+const char * infoStencil = "{stencil}";
+const char * infoStreamingPattern = "{streaming_pattern}";
+const char * infoCollisionSetup = "{collision_setup}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
+
+    streaming_pattern = 'esopull'
+    timesteps = get_timesteps(streaming_pattern)
+    stencil = LBStencil(Stencil.D3Q19)
+    method_enum = Method.CUMULANT
+
+    fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
+    collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
+
+    assert stencil.D == 3, "This application supports only three-dimensional stencils"
+    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
+    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
+    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
+
+    lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
+                           fourth_order_correction=fourth_order_correction,
+                           streaming_pattern=streaming_pattern)
+    lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
+
+    method = create_lb_method(lbm_config=lbm_config)
+    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
+
+    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
+                                     boundary_object=NoSlip())
+    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
+                                 boundary_object=UBB([0.05, 0, 0], data_type=field_type))
+
+    generate_lbm_package(ctx, name="NonUniformGridGPU",
+                         collision_rule=collision_rule,
+                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
+                         nonuniform=True, boundaries=[no_slip, ubb],
+                         macroscopic_fields=macroscopic_fields,
+                         target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
+                         max_threads=max_threads)
+
+    infoHeaderParams = {
+        'stencil': stencil.name.lower(),
+        'streaming_pattern': streaming_pattern,
+        'collision_setup': collision_setup,
+        'cse_global': int(lbm_opt.cse_global),
+        'cse_pdfs': int(lbm_opt.cse_pdfs),
+    }
+
+    field_typedefs = {'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
+    generate_info_header(ctx, 'NonUniformGridGPUInfoHeader',
+                         field_typedefs=field_typedefs,
+                         additional_code=info_header.format(**infoHeaderParams))
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sqlite3
+import os
+import sys
+
+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+
+DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
+BENCHMARK = int(os.environ.get('BENCHMARK', 0))
+
+WeakX = int(os.environ.get('WeakX', 128))
+WeakY = int(os.environ.get('WeakY', 128))
+WeakZ = int(os.environ.get('WeakZ', 128))
+
+StrongX = int(os.environ.get('StrongX', 128))
+StrongY = int(os.environ.get('StrongY', 128))
+StrongZ = int(os.environ.get('StrongZ', 128))
+
+
+class Scenario:
+    def __init__(self,
+                 domain_size=(64, 64, 64),
+                 root_blocks=(2, 2, 2),
+                 num_processes=1,
+                 refinement_depth=0,
+                 cells_per_block=(32, 32, 32),
+                 timesteps=101,
+                 gpu_enabled_mpi=False,
+                 vtk_write_frequency=0,
+                 logger_frequency=30,
+                 blockforest_filestem="blockforest",
+                 write_setup_vtk=True,
+                 db_file_name=None):
+
+        self.domain_size = domain_size
+        self.root_blocks = root_blocks
+        self.cells_per_block = cells_per_block
+        self.periodic = (0, 0, 1)
+
+        self.refinement_depth = refinement_depth
+        self.num_processes = num_processes
+        self.bfs_filestem = blockforest_filestem
+        self.write_setup_vtk = write_setup_vtk
+
+        self.timesteps = timesteps
+        self.gpu_enabled_mpi = gpu_enabled_mpi
+        self.vtk_write_frequency = vtk_write_frequency
+        self.logger_frequency = logger_frequency
+
+        self.db_file_name = DB_FILE if db_file_name is None else db_file_name
+
+        self.config_dict = self.config(print_dict=False)
+
+    @wlb.member_callback
+    def config(self, print_dict=True):
+        from pprint import pformat
+        config_dict = {
+            'DomainSetup': {
+                'domainSize': self.domain_size,
+                'rootBlocks': self.root_blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'SetupBlockForest': {
+                'refinementDepth': self.refinement_depth,
+                'numProcesses': self.num_processes,
+                'blockForestFilestem': self.bfs_filestem,
+                'writeVtk': self.write_setup_vtk,
+                'outputStatistics': True,
+                'writeSetupForestAndReturn': True,
+            },
+            'Parameters': {
+                'omega': 1.95,
+                'timesteps': self.timesteps,
+                'remainingTimeLoggerFrequency': self.logger_frequency,
+                'vtkWriteFrequency': self.vtk_write_frequency,
+                'useVTKAMRWriter': True,
+                'oneFilePerProcess': False,
+                'writeOnlySlice': False,
+                'gpuEnabledMPI': self.gpu_enabled_mpi,
+                'gpuBlockSize': (128, 1, 1),
+            },
+            'Logging': {
+                'logLevel': "info",
+            }
+        }
+
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+
+        return config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
+        sequenceValuesToScalars(data)
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs"
+        table_name = table_name.replace("-", "_")
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, self.db_file_name)
+                storeSingle(result, table_name, self.db_file_name)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+
+
+def validation_run():
+    """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
+    wlb.log_info_on_root("Validation run")
+
+    domain_size = (192, 192, 64)
+    cells_per_block = (64, 64, 64)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        cells_per_block=cells_per_block,
+                        timesteps=0,
+                        vtk_write_frequency=0,
+                        refinement_depth=3,
+                        gpu_enabled_mpi=False)
+    scenarios.add(scenario)
+
+
+def weak_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
+    wlb.log_info_on_root("Running weak scaling benchmark...")
+
+    # This benchmark must run from 16 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if uniform:
+        factor = 3 * num_proc
+        name = "uniform"
+    else:
+        if num_proc % 16 != 0:
+            raise RuntimeError("Number of processes must be dividable by 16")
+        factor = int(num_proc // 16)
+        name = "nonuniform"
+
+    cells_per_block = (WeakX, WeakY, WeakZ)
+    domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=num_proc,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        gpu_enabled_mpi=gpu_enabled_mpi,
+                        db_file_name=f"weakScalingGPU{name}LDC.sqlite3")
+    scenarios.add(scenario)
+
+
+def strong_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
+    wlb.log_info_on_root("Running strong scaling benchmark...")
+
+    # This benchmark must run from 64 GPUs onwards
+    if wlb.mpi.numProcesses() > 1:
+        num_proc = wlb.mpi.numProcesses()
+
+    if num_proc % 64 != 0:
+        raise RuntimeError("Number of processes must be dividable by 64")
+
+    cells_per_block = (StrongX, StrongY, StrongZ)
+
+    if uniform:
+        domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
+        name = "uniform"
+    else:
+        factor = int(num_proc / 64)
+        blocks64 = block_decomposition(factor)
+        cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
+        domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
+        name = "nonuniform"
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
+                        domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        num_processes=num_proc,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=0 if uniform else 3,
+                        timesteps=10,
+                        gpu_enabled_mpi=gpu_enabled_mpi,
+                        db_file_name=f"strongScalingGPU{name}LDC.sqlite3")
+    scenarios.add(scenario)
+
+
+if BENCHMARK == 0:
+    validation_run()
+elif BENCHMARK == 1:
+    weak_scaling_ldc(1, True, False)
+elif BENCHMARK == 2:
+    strong_scaling_ldc(1, True, False)
+else:
+    print(f"Invalid benchmark case {BENCHMARK}")
+
+
--- a/apps/benchmarks/Percolation/CMakeLists.txt
+++ b/apps/benchmarks/Percolation/CMakeLists.txt
+waLBerla_link_files_to_builddir("*.prm")
+
+if (WALBERLA_BUILD_WITH_CODEGEN)
+    if (NOT WALBERLA_BUILD_WITH_GPU_SUPPORT OR (WALBERLA_BUILD_WITH_GPU_SUPPORT AND (CMAKE_CUDA_ARCHITECTURES GREATER_EQUAL 60 OR WALBERLA_BUILD_WITH_HIP)))
+        waLBerla_add_executable(NAME Percolation FILES Percolation.cpp
+              DEPENDS walberla::blockforest walberla::core walberla::field walberla::geometry walberla::gpu walberla::lbm walberla::lbm_mesapd_coupling walberla::mesa_pd walberla::sqlite walberla::vtk PSMCodegenPython_trt-smagorinsky_sc1 )
+        target_compile_definitions(Percolation PRIVATE Weighting=2)
+    endif ()
+endif ()
--- a/apps/benchmarks/Percolation/Percolation.cpp
+++ b/apps/benchmarks/Percolation/Percolation.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Percolation.cpp
+//! \ingroup lbm_mesapd_coupling
+//! \author Samuel Kemmler <samuel.kemmler@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+
+#include "core/DataTypes.h"
+#include "core/Environment.h"
+#include "core/grid_generator/SCIterator.h"
+#include "core/logging/all.h"
+#include "core/timing/RemainingTimeLogger.h"
+
+#include "field/AddToStorage.h"
+#include "field/vtk/all.h"
+
+#include "geometry/InitBoundaryHandling.h"
+
+#include "gpu/AddGPUFieldToStorage.h"
+#include "gpu/DeviceSelectMPI.h"
+#include "gpu/communication/UniformGPUScheme.h"
+
+#include "lbm/PerformanceLogger.h"
+#include "lbm/vtk/all.h"
+
+#include "lbm_mesapd_coupling/DataTypesCodegen.h"
+#include "lbm_mesapd_coupling/partially_saturated_cells_method/codegen/PSMSweepCollection.h"
+#include "lbm_mesapd_coupling/utility/ParticleSelector.h"
+
+#include "mesa_pd/data/DataTypes.h"
+#include "mesa_pd/data/ParticleAccessorWithShape.h"
+#include "mesa_pd/data/ParticleStorage.h"
+#include "mesa_pd/data/ShapeStorage.h"
+#include "mesa_pd/data/shape/Sphere.h"
+#include "mesa_pd/domain/BlockForestDomain.h"
+#include "mesa_pd/mpi/SyncNextNeighbors.h"
+#include "mesa_pd/vtk/ParticleVtkOutput.h"
+
+#include "sqlite/SQLite.h"
+
+#include "vtk/all.h"
+
+#include "LBMSweep.h"
+#include "PSMPackInfo.h"
+#include "PSMSweep.h"
+#include "PSM_Density.h"
+#include "PSM_InfoHeader.h"
+#include "PSM_MacroGetter.h"
+
+namespace percolation
+{
+
+///////////
+// USING //
+///////////
+
+using namespace walberla;
+using namespace lbm_mesapd_coupling::psm::gpu;
+using PackInfo_T = pystencils::PSMPackInfo;
+
+using flag_t      = walberla::uint8_t;
+using FlagField_T = FlagField< flag_t >;
+
+///////////
+// FLAGS //
+///////////
+
+const FlagUID Fluid_Flag("Fluid");
+const FlagUID Density_Flag("Density");
+const FlagUID NoSlip_Flag("NoSlip");
+const FlagUID Inflow_Flag("Inflow");
+
+//////////
+// MAIN //
+//////////
+
+//*******************************************************************************************************************
+/*!\brief Benchmark of a percolation setup
+ *
+ * This code can be used as a percolation (useParticles=true) or as a channel flow (useParticles=false) benchmark.
+ * A constant inflow from west is applied and a pressure boundary condition is set at the east.
+ * For the percolation, mono-sized fixed spherical particles are generated on a structured grid with an offset for
+ * every second particle layer in flow direction to avoid channels in flow direction. The flow is described by Darcy's
+ * law. For the channel flow, the flow is described by the Hagen–Poiseuille equation.
+ *
+ * The domain is either periodic or bounded by (no slip) walls in the vertical directions (y and z).
+ *
+ * For the percolation, the PSM is used in combination with a two-way coupling, but no particle dynamics.
+ * For the channel flow, only the LBM is used.
+ *
+ * The parameters can be changed via the input file.
+ *
+ */
+//*******************************************************************************************************************
+int main(int argc, char** argv)
+{
+   Environment env(argc, argv);
+   auto cfgFile = env.config();
+   if (!cfgFile) { WALBERLA_ABORT("Usage: " << argv[0] << " path-to-configuration-file \n"); }
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::selectDeviceBasedOnMpiRank();
+#endif
+
+   WALBERLA_LOG_INFO_ON_ROOT("waLBerla revision: " << std::string(WALBERLA_GIT_SHA1).substr(0, 8));
+   WALBERLA_LOG_INFO_ON_ROOT("compiler flags: " << std::string(WALBERLA_COMPILER_FLAGS));
+   WALBERLA_LOG_INFO_ON_ROOT("build machine: " << std::string(WALBERLA_BUILD_MACHINE));
+   WALBERLA_LOG_INFO_ON_ROOT(*cfgFile);
+
+   // Read config file
+   Config::BlockHandle numericalSetup = cfgFile->getBlock("NumericalSetup");
+   const uint_t numXBlocks            = numericalSetup.getParameter< uint_t >("numXBlocks");
+   const uint_t numYBlocks            = numericalSetup.getParameter< uint_t >("numYBlocks");
+   const uint_t numZBlocks            = numericalSetup.getParameter< uint_t >("numZBlocks");
+   WALBERLA_CHECK_EQUAL(numXBlocks * numYBlocks * numZBlocks, uint_t(MPIManager::instance()->numProcesses()),
+                        "When using GPUs, the number of blocks ("
+                           << numXBlocks * numYBlocks * numZBlocks << ") has to match the number of MPI processes ("
+                           << uint_t(MPIManager::instance()->numProcesses()) << ")");
+   const bool periodicInY                 = numericalSetup.getParameter< bool >("periodicInY");
+   const bool periodicInZ                 = numericalSetup.getParameter< bool >("periodicInZ");
+   const uint_t numXCellsPerBlock         = numericalSetup.getParameter< uint_t >("numXCellsPerBlock");
+   const uint_t numYCellsPerBlock         = numericalSetup.getParameter< uint_t >("numYCellsPerBlock");
+   const uint_t numZCellsPerBlock         = numericalSetup.getParameter< uint_t >("numZCellsPerBlock");
+   const bool sendDirectlyFromGPU         = numericalSetup.getParameter< bool >("sendDirectlyFromGPU");
+   const bool useCommunicationHiding      = numericalSetup.getParameter< bool >("useCommunicationHiding");
+   const Vector3< uint_t > frameWidth     = numericalSetup.getParameter< Vector3< uint_t > >("frameWidth");
+   const uint_t timeSteps                 = numericalSetup.getParameter< uint_t >("timeSteps");
+   const bool useParticles                = numericalSetup.getParameter< bool >("useParticles");
+   const real_t particleDiameter          = numericalSetup.getParameter< real_t >("particleDiameter");
+   const real_t particleGenerationSpacing = numericalSetup.getParameter< real_t >("particleGenerationSpacing");
+   const Vector3< real_t > generationDomainFraction =
+      numericalSetup.getParameter< Vector3< real_t > >("generationDomainFraction");
+   const Vector3< uint_t > generationPointOfReferenceOffset =
+      numericalSetup.getParameter< Vector3< uint_t > >("generationPointOfReferenceOffset");
+   const bool useParticleOffset = numericalSetup.getParameter< bool >("useParticleOffset");
+   const Vector3< uint_t > particleSubBlockSize =
+      numericalSetup.getParameter< Vector3< uint_t > >("particleSubBlockSize");
+   const real_t uInflow        = numericalSetup.getParameter< real_t >("uInflow");
+   const real_t relaxationRate = numericalSetup.getParameter< real_t >("relaxationRate");
+   if ((periodicInY && numYBlocks == 1) || (periodicInZ && numZBlocks == 1))
+   {
+      WALBERLA_LOG_WARNING_ON_ROOT("Using only 1 block in periodic dimensions can lead to unexpected behavior.")
+   }
+   const real_t viscosity = lbm::collision_model::viscosityFromOmega(relaxationRate);
+   WALBERLA_LOG_DEVEL_VAR_ON_ROOT(viscosity)
+
+   Config::BlockHandle outputSetup      = cfgFile->getBlock("Output");
+   const uint_t vtkSpacing              = outputSetup.getParameter< uint_t >("vtkSpacing");
+   const std::string vtkFolder          = outputSetup.getParameter< std::string >("vtkFolder");
+   const uint_t performanceLogFrequency = outputSetup.getParameter< uint_t >("performanceLogFrequency");
+
+   ///////////////////////////
+   // BLOCK STRUCTURE SETUP //
+   ///////////////////////////
+
+   const bool periodicInX                     = false;
+   shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGrid(
+      numXBlocks, numYBlocks, numZBlocks, numXCellsPerBlock, numYCellsPerBlock, numZCellsPerBlock, real_t(1), uint_t(0),
+      false, false, periodicInX, periodicInY, periodicInZ, // periodicity
+      false);
+
+   auto simulationDomain = blocks->getDomain();
+
+   ////////////
+   // MesaPD //
+   ////////////
+
+   auto rpdDomain = std::make_shared< mesa_pd::domain::BlockForestDomain >(blocks->getBlockForestPointer());
+
+   // Init data structures
+   auto ps                  = walberla::make_shared< mesa_pd::data::ParticleStorage >(1);
+   auto ss                  = walberla::make_shared< mesa_pd::data::ShapeStorage >();
+   using ParticleAccessor_T = mesa_pd::data::ParticleAccessorWithShape;
+   auto accessor            = walberla::make_shared< ParticleAccessor_T >(ps, ss);
+   auto sphereShape         = ss->create< mesa_pd::data::Sphere >(particleDiameter * real_t(0.5));
+
+   // Create spheres
+   if (useParticles)
+   {
+      // Ensure that generation domain is computed correctly
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.xMin(), real_t(0));
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.yMin(), real_t(0));
+      WALBERLA_CHECK_FLOAT_EQUAL(simulationDomain.zMin(), real_t(0));
+
+      auto generationDomain = math::AABB::createFromMinMaxCorner(
+         math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) - generationDomainFraction[0]) / real_t(2),
+                                 simulationDomain.yMax() * (real_t(1) - generationDomainFraction[1]) / real_t(2),
+                                 simulationDomain.zMax() * (real_t(1) - generationDomainFraction[2]) / real_t(2)),
+         math::Vector3< real_t >(simulationDomain.xMax() * (real_t(1) + generationDomainFraction[0]) / real_t(2),
+                                 simulationDomain.yMax() * (real_t(1) + generationDomainFraction[1]) / real_t(2),
+                                 simulationDomain.zMax() * (real_t(1) + generationDomainFraction[2]) / real_t(2)));
+      real_t particleOffset = particleGenerationSpacing / real_t(2);
+      for (auto pt :
+           grid_generator::SCGrid(generationDomain, generationDomain.center() + generationPointOfReferenceOffset,
+                                  particleGenerationSpacing))
+      {
+         // Offset every second particle layer in flow direction to avoid channels in flow direction
+         if (useParticleOffset &&
+             uint_t(round(math::abs(generationDomain.center()[0] - pt[0]) / (particleGenerationSpacing))) % uint_t(2) !=
+                uint_t(0))
+         {
+            pt = pt + Vector3(real_t(0), particleOffset, particleOffset);
+         }
+         if (rpdDomain->isContainedInProcessSubdomain(uint_c(mpi::MPIManager::instance()->rank()), pt))
+         {
+            mesa_pd::data::Particle&& p = *ps->create();
+            p.setPosition(pt);
+            p.setInteractionRadius(particleDiameter * real_t(0.5));
+            p.setOwner(mpi::MPIManager::instance()->rank());
+            p.setShapeID(sphereShape);
+            p.setType(0);
+         }
+      }
+   }
+
+   ///////////////////////
+   // ADD DATA TO BLOCKS //
+   ////////////////////////
+
+   // Setting initial PDFs to nan helps to detect bugs in the initialization/BC handling
+   // Depending on WALBERLA_BUILD_WITH_GPU_SUPPORT, pdfFieldCPUGPUID is either a CPU or a CPU field
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   BlockDataID pdfFieldID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field (fzyx)", real_c(std::nan("")), field::fzyx);
+   BlockDataID BFieldID         = field::addToStorage< BField_T >(blocks, "B field", 0, field::fzyx, 1);
+   BlockDataID pdfFieldCPUGPUID = gpu::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldID, "pdf field GPU");
+#else
+   BlockDataID pdfFieldCPUGPUID =
+      field::addToStorage< PdfField_T >(blocks, "pdf field CPU", real_c(std::nan("")), field::fzyx);
+#endif
+   BlockDataID densityFieldID = field::addToStorage< DensityField_T >(blocks, "density field", real_t(0), field::fzyx);
+   BlockDataID velFieldID  = field::addToStorage< VelocityField_T >(blocks, "velocity field", real_t(0), field::fzyx);
+   BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+
+   // Synchronize particles between the blocks for the correct mapping of ghost particles
+   mesa_pd::mpi::SyncNextNeighbors syncNextNeighborFunc;
+   syncNextNeighborFunc(*ps, *rpdDomain);
+
+   // Assemble boundary block string
+   std::string boundariesBlockString = " Boundaries"
+                                       "{"
+                                       "Border { direction W;    walldistance -1;  flag Inflow; }"
+                                       "Border { direction E;    walldistance -1;  flag Density; }";
+
+   if (!periodicInY)
+   {
+      boundariesBlockString += "Border { direction S;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction N;    walldistance -1;  flag NoSlip; }";
+   }
+
+   if (!periodicInZ)
+   {
+      boundariesBlockString += "Border { direction T;    walldistance -1;  flag NoSlip; }"
+                               "Border { direction B;    walldistance -1;  flag NoSlip; }";
+   }
+
+   boundariesBlockString += "}";
+   WALBERLA_ROOT_SECTION()
+   {
+      std::ofstream boundariesFile("boundaries.prm");
+      boundariesFile << boundariesBlockString;
+      boundariesFile.close();
+   }
+   WALBERLA_MPI_BARRIER()
+
+   auto boundariesCfgFile = Config();
+   boundariesCfgFile.readParameterFile("boundaries.prm");
+   auto boundariesConfig = boundariesCfgFile.getBlock("Boundaries");
+
+   // map boundaries into the LBM simulation
+   geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+   geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, Fluid_Flag);
+   lbm::PSM_Density density_bc(blocks, pdfFieldCPUGPUID, real_t(1.0));
+   density_bc.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Density_Flag, Fluid_Flag);
+   lbm::PSM_NoSlip noSlip(blocks, pdfFieldCPUGPUID);
+   noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, NoSlip_Flag, Fluid_Flag);
+   lbm::PSM_UBB ubb(blocks, pdfFieldCPUGPUID, uInflow, real_t(0), real_t(0));
+   ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, Inflow_Flag, Fluid_Flag);
+
+   ///////////////
+   // TIME LOOP //
+   ///////////////
+
+   // Map particles into the fluid domain
+   ParticleAndVolumeFractionSoA_T< Weighting > particleAndVolumeFractionSoA(blocks, relaxationRate);
+   PSMSweepCollection psmSweepCollection(blocks, accessor, lbm_mesapd_coupling::RegularParticlesSelector(),
+                                         particleAndVolumeFractionSoA, particleSubBlockSize);
+   if (useParticles)
+   {
+      for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+      {
+         psmSweepCollection.particleMappingSweep(&(*blockIt));
+      }
+   }
+
+   // Initialize PDFs
+   pystencils::InitializeDomainForPSM pdfSetter(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0), real_t(0), real_t(0),
+      real_t(1.0), real_t(0), real_t(0), real_t(0));
+
+   for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
+   {
+      // pdfSetter requires particle velocities at cell centers
+      if (useParticles) { psmSweepCollection.setParticleVelocitiesSweep(&(*blockIt)); }
+      pdfSetter(&(*blockIt));
+   }
+
+   // Setup of the LBM communication for synchronizing the pdf field between neighboring blocks
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU, false);
+#else
+   walberla::blockforest::communication::UniformBufferedScheme< Stencil_T > com(blocks);
+#endif
+   com.addPackInfo(make_shared< PackInfo_T >(pdfFieldCPUGPUID));
+   auto communication = std::function< void() >([&]() { com.communicate(); });
+
+   SweepTimeloop commTimeloop(blocks->getBlockStorage(), timeSteps);
+   SweepTimeloop timeloop(blocks->getBlockStorage(), timeSteps);
+
+   timeloop.addFuncBeforeTimeStep(RemainingTimeLogger(timeloop.getNrOfTimeSteps()), "Remaining Time Logger");
+
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+   pystencils::PSM_MacroGetter getterSweep(BFieldID, densityFieldID, pdfFieldID, velFieldID, real_t(0.0), real_t(0.0),
+                                           real_t(0.0));
+#else
+   pystencils::PSM_MacroGetter getterSweep(particleAndVolumeFractionSoA.BFieldID, densityFieldID, pdfFieldCPUGPUID,
+                                           velFieldID, real_t(0.0), real_t(0.0), real_t(0.0));
+#endif
+   // VTK output
+   if (vtkSpacing != uint_t(0))
+   {
+      // Spheres
+      auto particleVtkOutput = make_shared< mesa_pd::vtk::ParticleVtkOutput >(ps);
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleUid >("uid");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleLinearVelocity >("velocity");
+      particleVtkOutput->addOutput< mesa_pd::data::SelectParticleInteractionRadius >("radius");
+      // Limit output to process-local spheres
+      particleVtkOutput->setParticleSelector([sphereShape](const mesa_pd::data::ParticleStorage::iterator& pIt) {
+         return pIt->getShapeID() == sphereShape &&
+                !(mesa_pd::data::particle_flags::isSet(pIt->getFlags(), mesa_pd::data::particle_flags::GHOST));
+      });
+      auto particleVtkWriter = vtk::createVTKOutput_PointData(particleVtkOutput, "particles", vtkSpacing, vtkFolder);
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(particleVtkWriter), "VTK (sphere data)");
+
+      // Fields
+      auto pdfFieldVTK = vtk::createVTKOutput_BlockData(blocks, "fluid", vtkSpacing, 0, false, vtkFolder);
+
+      pdfFieldVTK->addBeforeFunction(communication);
+
+      pdfFieldVTK->addBeforeFunction([&]() {
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+         gpu::fieldCpy< PdfField_T, gpu::GPUField< real_t > >(blocks, pdfFieldID, pdfFieldCPUGPUID);
+         gpu::fieldCpy< GhostLayerField< real_t, 1 >, BFieldGPU_T >(blocks, BFieldID,
+                                                                    particleAndVolumeFractionSoA.BFieldID);
+#endif
+         for (auto& block : *blocks)
+            getterSweep(&block);
+      });
+
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< VelocityField_T > >(velFieldID, "Velocity"));
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< DensityField_T > >(densityFieldID, "Density"));
+#ifdef WALBERLA_BUILD_WITH_GPU_SUPPORT
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< BField_T > >(BFieldID, "OverlapFraction"));
+#else
+      pdfFieldVTK->addCellDataWriter(
+         make_shared< field::VTKWriter< BField_T > >(particleAndVolumeFractionSoA.BFieldID, "OverlapFraction"));
+#endif
+      pdfFieldVTK->addCellDataWriter(make_shared< field::VTKWriter< FlagField_T > >(flagFieldID, "FlagField"));
+
+      timeloop.addFuncBeforeTimeStep(vtk::writeFiles(pdfFieldVTK), "VTK (fluid field data)");
+   }
+
+   if (vtkSpacing != uint_t(0)) { vtk::writeDomainDecomposition(blocks, "domain_decomposition", vtkFolder); }
+
+   // Add performance logging
+   lbm::PerformanceLogger< FlagField_T > performanceLogger(blocks, flagFieldID, Fluid_Flag, performanceLogFrequency);
+   if (performanceLogFrequency > 0)
+   {
+      timeloop.addFuncAfterTimeStep(performanceLogger, "Evaluate performance logging");
+   }
+
+   // Add LBM communication function and boundary handling sweep
+   if (useCommunicationHiding)
+   {
+      timeloop.add() << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
+   }
+   else
+   {
+      timeloop.add() << BeforeFunction(communication, "LBM Communication")
+                     << Sweep(deviceSyncWrapper(density_bc.getSweep()), "Boundary Handling (Density)");
+   }
+   timeloop.add() << Sweep(deviceSyncWrapper(ubb.getSweep()), "Boundary Handling (UBB)");
+   if (!periodicInY || !periodicInZ)
+   {
+      timeloop.add() << Sweep(deviceSyncWrapper(noSlip.getSweep()), "Boundary Handling (NoSlip)");
+   }
+
+   // PSM kernel
+   pystencils::PSMSweep PSMSweep(particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+                                 particleAndVolumeFractionSoA.particleForcesFieldID,
+                                 particleAndVolumeFractionSoA.particleVelocitiesFieldID, pdfFieldCPUGPUID, real_t(0.0),
+                                 real_t(0.0), real_t(0.0), relaxationRate);
+   pystencils::PSMSweepSplit PSMSplitSweep(
+      particleAndVolumeFractionSoA.BsFieldID, particleAndVolumeFractionSoA.BFieldID,
+      particleAndVolumeFractionSoA.particleForcesFieldID, particleAndVolumeFractionSoA.particleVelocitiesFieldID,
+      pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate, frameWidth);
+   pystencils::LBMSweep LBMSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate);
+   pystencils::LBMSplitSweep LBMSplitSweep(pdfFieldCPUGPUID, real_t(0.0), real_t(0.0), real_t(0.0), relaxationRate,
+                                           frameWidth);
+
+   if (useParticles)
+   {
+      if (useCommunicationHiding)
+      {
+         addPSMSweepsToTimeloops(commTimeloop, timeloop, com, psmSweepCollection, PSMSplitSweep);
+      }
+      else { addPSMSweepsToTimeloop(timeloop, psmSweepCollection, PSMSweep); }
+   }
+   else
+   {
+      if (useCommunicationHiding)
+      {
+         commTimeloop.add() << BeforeFunction([&]() { com.startCommunication(); }, "LBM Communication (start)")
+                            << Sweep(deviceSyncWrapper(LBMSplitSweep.getInnerSweep()), "LBM inner sweep")
+                            << AfterFunction([&]() { com.wait(); }, "LBM Communication (wait)");
+         timeloop.add() << Sweep(deviceSyncWrapper(LBMSplitSweep.getOuterSweep()), "LBM outer sweep");
+      }
+      else { timeloop.add() << Sweep(deviceSyncWrapper(LBMSweep), "LBM sweep"); }
+   }
+
+   WcTimingPool timeloopTiming;
+   // TODO: maybe add warmup phase
+   for (uint_t timeStep = 0; timeStep < timeSteps; ++timeStep)
+   {
+      if (useCommunicationHiding) { commTimeloop.singleStep(timeloopTiming); }
+      timeloop.singleStep(timeloopTiming);
+   }
+   timeloopTiming.logResultOnRoot();
+   auto timeloopTimingReduced = timeloopTiming.getReduced();
+
+   // Write parameters and performance results in sqlite database
+   WALBERLA_ROOT_SECTION()
+   {
+      // Use DB_FILE environment variable if set
+      std::string dbFile;
+      if (std::getenv("DB_FILE") != nullptr) { dbFile = std::getenv("DB_FILE"); }
+      else
+      {
+         if (useParticles) { dbFile = "percolation_benchmark.sqlite3"; }
+         else { dbFile = "channel_flow_benchmark.sqlite3"; }
+      }
+
+      std::map< std::string, int > integerProperties;
+      std::map< std::string, double > realProperties;
+      std::map< std::string, std::string > stringProperties;
+
+      integerProperties["numXBlocks"]                = int(numXBlocks);
+      integerProperties["numYBlocks"]                = int(numYBlocks);
+      integerProperties["numZBlocks"]                = int(numZBlocks);
+      integerProperties["numXCellsPerBlock"]         = int(numXCellsPerBlock);
+      integerProperties["numYCellsPerBlock"]         = int(numYCellsPerBlock);
+      integerProperties["numZCellsPerBlock"]         = int(numZCellsPerBlock);
+      integerProperties["timeSteps"]                 = int(timeSteps);
+      integerProperties["sendDirectlyFromGPU"]       = int(sendDirectlyFromGPU);
+      integerProperties["useCommunicationHiding"]    = int(useCommunicationHiding);
+      integerProperties["communicationHidingXWidth"] = int(frameWidth[0]);
+      integerProperties["communicationHidingYWidth"] = int(frameWidth[1]);
+      integerProperties["communicationHidingZWidth"] = int(frameWidth[2]);
+      integerProperties["useParticles"]              = int(useParticles);
+      integerProperties["numParticles"]              = int(ps->size());
+      integerProperties["particleSubBlockXSize"]     = int(particleSubBlockSize[0]);
+      integerProperties["particleSubBlockYSize"]     = int(particleSubBlockSize[1]);
+      integerProperties["particleSubBlockZSize"]     = int(particleSubBlockSize[2]);
+
+      realProperties["particleDiameter"]          = double(particleDiameter);
+      realProperties["particleGenerationSpacing"] = double(particleGenerationSpacing);
+
+      performanceLogger.getBestResultsForSQLOnRoot(integerProperties, realProperties, stringProperties);
+
+      auto runId = sqlite::storeRunInSqliteDB(dbFile, integerProperties, stringProperties, realProperties);
+      sqlite::storeTimingPoolInSqliteDB(dbFile, runId, *timeloopTimingReduced, "Timeloop");
+   }
+
+   return EXIT_SUCCESS;
+}
+
+} // namespace percolation
+
+int main(int argc, char** argv) { percolation::main(argc, argv); }
--- a/apps/benchmarks/Percolation/benchmark.prm
+++ b/apps/benchmarks/Percolation/benchmark.prm
+
+NumericalSetup
+{
+    // product of number of blocks should be equal to number of used processes
+    numXBlocks 1;
+    numYBlocks 1;
+    numZBlocks 1;
+
+    periodicInY false;
+    periodicInZ false;
+
+    numXCellsPerBlock 256;
+    numYCellsPerBlock 128;
+    numZCellsPerBlock 128;
+
+    timeSteps 100;
+
+    sendDirectlyFromGPU false; // use GPU-GPU communication
+    useCommunicationHiding false;
+    frameWidth <1, 1, 1>; // width of the outer region if splitting the LBM/PSM into inner and outer (only used if useCommunicationHiding is true)
+
+    // particle distribution in LBM units
+    useParticles true; // if true, PSM/particle mapping/velocity computation/hydrodynamic force reduction is used, else LBM is used
+    particleDiameter 20.0;
+    particleGenerationSpacing 21.0;
+    generationDomainFraction <0.8, 1.0, 1.0>; // fraction of the domain where particles are generated
+    generationPointOfReferenceOffset <0, 0, 0>; // offset of point of reference from domain center, see SCIterator.h
+    useParticleOffset true; // offset every second particle layer in flow direction
+    particleSubBlockSize <8, 8, 8>;
+
+    // fluid quantities in LBM units
+    uInflow 0.00008;
+    relaxationRate 0.9;
+}
+
+Output
+{
+    vtkSpacing 0;
+    vtkFolder vtk_out;
+
+    performanceLogFrequency 100;
+}
--- a/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
+++ b/apps/benchmarks/PhaseFieldAllenCahn/CMakeLists.txt
@@ -12,13 +12,13 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
        PackInfo_velocity_based_distributions.${CODEGEN_FILE_SUFFIX} PackInfo_velocity_based_distributions.h
        GenDefines.h)

-if (WALBERLA_BUILD_WITH_CUDA)
+if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core cuda field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
+          DEPENDS walberla::blockforest walberla::core walberla::gpu walberla::field walberla::postprocessing walberla::python_coupling walberla::lbm_generated walberla::geometry walberla::timeloop BenchmarkPhaseFieldCodeGen )
 else ()
    waLBerla_add_executable(NAME benchmark_multiphase
            FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
-            DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
-endif (WALBERLA_BUILD_WITH_CUDA)
+          DEPENDS walberla::blockforest walberla::core walberla::field walberla::postprocessing walberla::python_coupling walberla::lbm_generated walberla::geometry walberla::timeloop BenchmarkPhaseFieldCodeGen )
+endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )

--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark.py
@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
 import sys
 from math import prod

+try:
+    import machinestate as ms
+except ImportError:
+    ms = None
+

 def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
    """Checks if a single block of given size fits into GPU memory"""
@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s


 class Scenario:
-    def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
+    def __init__(self, time_step_strategy,
+                 cuda_block_size,
+                 cells_per_block=(256, 256, 256),
                 cuda_enabled_mpi=False):
        # output frequencies
        self.vtkWriteFrequency = 0
@@ -89,6 +96,14 @@ class Scenario:
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
+        if ms:
+            state = ms.MachineState(extended=False, anonymous=True)
+            state.generate()                        # generate subclasses
+            state.update()                          # read information
+            data["MachineState"] = str(state.get())
+        else:
+            print("MachineState module is not available. MachineState was not saved")
+
        sequenceValuesToScalars(data)

        df = pd.DataFrame.from_records([data])
@@ -101,43 +116,19 @@ class Scenario:
 def benchmark():
    scenarios = wlb.ScenarioManager()

-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
    gpu_mem = gpu_mem_gb * (2 ** 30)

-    block_size = (256, 256, 256)
+    block_size = (320, 320, 320)
+    cuda_enabled_mpi = True

    if not domain_block_size_ok(block_size, gpu_mem):
        wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
    else:
-        scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
-
+        scenarios.add(Scenario(time_step_strategy='normal',
+                               cuda_block_size=(128, 1, 1),
+                               cells_per_block=block_size,
+                               cuda_enabled_mpi=cuda_enabled_mpi))

-def kernel_benchmark():
-    scenarios = wlb.ScenarioManager()
-
-    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
-    gpu_mem = gpu_mem_gb * (2 ** 30)

-    block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
-
-    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
-                   (32, 2, 1), (64, 2, 1), (128, 2, 1),
-                   (32, 4, 1), (64, 4, 1),
-                   (32, 4, 2),
-                   (32, 8, 1),
-                   (16, 16, 1)]
-
-    for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
-        for cuda_block in cuda_blocks:
-            for block_size in block_sizes:
-                if not domain_block_size_ok(block_size, gpu_mem):
-                    wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
-                    continue
-                scenario = Scenario(time_step_strategy=time_step_strategy,
-                                    cuda_block_size=cuda_block,
-                                    cells_per_block=block_size)
-                scenarios.add(scenario)
-
-
-# benchmark()
-kernel_benchmark()
+benchmark()
No results found