diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
index f332e065fed35fa99367127e9d44b211849cc7b3..de1a2c1db0cbd078a80879f964f6a046f98afc95 100644
--- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -10,6 +10,12 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
         UBB.h UBB.cpp
         NonUniformGridCPUBoundaryCollection.h
         NonUniformGridCPUInfoHeader.h)
+
+waLBerla_add_executable( NAME NonUniformGridGenerator
+                         FILES NonUniformGridGenerator.cpp LdcSetup.h
+                         DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk )
+
+
 waLBerla_add_executable( NAME NonUniformGridCPU
-                         FILES NonUniformGridCPU.cpp
-                         DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated )
\ No newline at end of file
+                         FILES NonUniformGridCPU.cpp LdcSetup.h
+                         DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated )
diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
new file mode 100644
index 0000000000000000000000000000000000000000..1657e85e0e58dc0a1107ba62505e438c1821a1c1
--- /dev/null
+++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -0,0 +1,109 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file LdcSetup.h
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+
+#include "core/all.h"
+
+#include "field/FlagField.h"
+
+#include "field/FlagUID.h"
+
+using namespace walberla;
+
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+
+using FlagField_T          = FlagField< uint8_t >;
+
+class LDCRefinement
+{
+ private:
+   const uint_t refinementDepth_;
+
+ public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest) const
+   {
+      const AABB & domain = forest.getDomain();
+
+      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+
+      AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(),
+                      domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() );
+
+      AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(),
+                       domain.xMax(), domain.yMax(), domain.zMax() );
+
+      for(auto & block : forest)
+      {
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         {
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
+         }
+      }
+   }
+};
+
+class LDC
+{
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+ public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
+
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         auto& b           = dynamic_cast< Block& >(*bIt);
+         const uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
+         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            const Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
+                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
+            {
+               flagField->addFlag(localCell, noslipFlag);
+            }
+         }
+      }
+   }
+};
diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
index 4d3ce332f4f6b23e7fbfb5e885b61c3268232155..6476d82ec1877b7f2732837d89cf76a4d81b2627 100644
--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
@@ -15,13 +15,10 @@
 //
 //! \file NonUniformGridCPU.cpp
 //! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
 //
 //======================================================================================================================
 
-#include "blockforest/Initialization.h"
-#include "blockforest/SetupBlockForest.h"
-#include "blockforest/loadbalancing/StaticCurve.h"
-
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
 #include "core/timing/RemainingTimeLogger.h"
@@ -33,20 +30,21 @@
 
 #include "geometry/InitBoundaryHandling.h"
 
-#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
-#include "lbm_generated/field/AddToStorage.h"
-#include "lbm_generated/field/PdfField.h"
-#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
-#include "lbm_generated/evaluation/PerformanceEvaluation.h"
-
 #include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
 #include "python_coupling/PythonCallback.h"
 
 #include "timeloop/SweepTimeloop.h"
 
 #include <cmath>
 
+#include "LdcSetup.h"
 #include "NonUniformGridCPUInfoHeader.h"
+#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
+#include "lbm_generated/evaluation/PerformanceEvaluation.h"
+#include "lbm_generated/field/AddToStorage.h"
+#include "lbm_generated/field/PdfField.h"
+#include "lbm_generated/refinement/BasicRecursiveTimeStep.h"
 
 using namespace walberla;
 
@@ -55,157 +53,47 @@ using Stencil_T              = StorageSpecification_T::Stencil;
 using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil;
 
 using PdfField_T           = lbm_generated::PdfField< StorageSpecification_T >;
-using FlagField_T          = FlagField< uint8_t >;
 using BoundaryCollection_T = lbm::NonUniformGridCPUBoundaryCollection< FlagField_T >;
 
 using SweepCollection_T = lbm::NonUniformGridCPUSweepCollection;
 
 using blockforest::communication::NonUniformBufferedScheme;
-using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
-
-
-class LDCRefinement
-{
-private:
-   const uint_t refinementDepth_;
-
-public:
-   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
-
-   void operator()(SetupBlockForest& forest) const
-   {
-      const AABB & domain = forest.getDomain();
-
-      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
-
-      AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
-                       domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
-
-      AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
-                        domain.xMax(), domain.yMin() + ySize, domain.zMax() );
-
-      for(auto & block : forest)
-      {
-         auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
-         {
-            if( block.getLevel() < refinementDepth_)
-               block.setMarker( true );
-         }
-      }
-   }
-};
-
-class LDC
-{
-private:
-   const std::string refinementProfile_;
-   const uint_t refinementDepth_;
-
-   const FlagUID noSlipFlagUID_;
-   const FlagUID ubbFlagUID_;
-
-public:
-   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
-
-   RefinementSelectionFunctor refinementSelector() const
-   {
-      return LDCRefinement(refinementDepth_);
-   }
-
-   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
-   {
-      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
-      {
-         auto& b           = dynamic_cast< Block& >(*bIt);
-         const uint_t level       = b.getLevel();
-         auto flagField     = b.getData< FlagField_T >(flagFieldID);
-         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
-         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
-         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
-         {
-            const Cell localCell = cIt.cell();
-            Cell globalCell(localCell);
-            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
-            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
-            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
-                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
-            {
-               flagField->addFlag(localCell, noslipFlag);
-            }
-         }
-      }
-   }
-};
-
-static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses()))
-{
-   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
-   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
-   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
-
-   auto refSelection = ldcSetup.refinementSelector();
-   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
-   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
-   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
-   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
-   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
-}
 
 int main(int argc, char** argv)
 {
    const mpi::Environment env(argc, argv);
    mpi::MPIManager::instance()->useWorldComm();
 
+   const std::string input_filename(argv[1]);
+   const bool inputIsPython = string_ends_with(input_filename, ".py");
+
    for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
       WALBERLA_MPI_BARRIER()
+
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      ///                                        SETUP AND CONFIGURATION                                             ///
+      ///                                            BLOCK FOREST SETUP                                              ///
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
       auto config = *cfg;
       logging::configureLogging(config);
-      auto domainSetup = config->getOneBlock("DomainSetup");
 
-      // Reading parameters
-      auto parameters              = config->getOneBlock("Parameters");
-      const real_t omega       = parameters.getParameter< real_t >("omega", real_c(1.4));
-      const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1));
-      const uint_t timesteps   = parameters.getParameter< uint_t >("timesteps", uint_c(50));
-      const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false);
-      const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false);
-      const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses");
-
-      auto ldc = std::make_shared< LDC >(refinementDepth);
-      SetupBlockForest setupBfs;
-      if (writeSetupForestAndReturn)
-      {
-         WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes")
-         WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
-         createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses);
-
-         WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); }
+      auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+      const std::string blockForestFilestem =
+         blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
+      const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
 
-         WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
-         for (uint_t level = 0; level <= refinementDepth; level++)
-         {
-            const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
-            WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
-         }
-
-         WALBERLA_LOG_INFO_ON_ROOT("Ending program")
-         return EXIT_SUCCESS;
-      }
+      auto domainSetup                = config->getOneBlock("DomainSetup");
+      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
 
-      WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
-      createSetupBlockForest(setupBfs, domainSetup, *ldc);
+      // Load structured block forest from file
+      std::ostringstream oss;
+      oss << blockForestFilestem << ".bfs";
+      const std::string setupBlockForestFilepath = oss.str();
 
-      // Create structured block forest
-      Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
       WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
-      auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
+      auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
+                                                 setupBlockForestFilepath.c_str(), false);
       auto blocks =
          std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
       blocks->createCellBoundingBoxes();
@@ -216,6 +104,18 @@ int main(int argc, char** argv)
          WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level))
       }
 
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                    DISTRIBUTED DATA STRUCTURES                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      auto ldc                     = std::make_shared< LDC >(refinementDepth);
+
+      // Reading parameters
+      auto parameters                = config->getOneBlock("Parameters");
+      const real_t omega             = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const uint_t timesteps         = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false);
+
       // Creating fields
       const StorageSpecification_T StorageSpec = StorageSpecification_T();
       const BlockDataID pdfFieldID =
@@ -263,12 +163,11 @@ int main(int argc, char** argv)
 
       SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
 
-      if(benchmarkKernelOnly){
+      if (benchmarkKernelOnly)
+      {
          timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
       }
-      else{
-         LBMMeshRefinement.addRefinementToTimeLoop(timeLoop);
-      }
+      else { LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); }
 
       // VTK
       const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
@@ -298,11 +197,12 @@ int main(int argc, char** argv)
          timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
       }
 
-      lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID);
-      field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID );
+      lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
+      field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
       fluidCells();
 
-      WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << fluidCells.numberOfCells() << " fluid cells (in total on all levels)")
+      WALBERLA_LOG_INFO_ON_ROOT("Non uniform Grid benchmark with " << fluidCells.numberOfCells()
+                                                                   << " fluid cells (in total on all levels)")
 
       WcTimingPool timeloopTiming;
       WcTimer simTimer;
@@ -323,6 +223,31 @@ int main(int argc, char** argv)
 
       const auto reducedTimeloopTiming = timeloopTiming.getReduced();
       WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
+
+      WALBERLA_ROOT_SECTION()
+      {
+         if (inputIsPython)
+         {
+            python_coupling::PythonCallback pythonCallbackResults("results_callback");
+            if (pythonCallbackResults.isCallable())
+            {
+               pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
+               pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+               pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+               pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
+               pythonCallbackResults.data().exposeValue("mlupsPerProcess",
+                                                        performance.mlupsPerProcess(timesteps, time));
+               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+               // Call Python function to report results
+               pythonCallbackResults();
+            }
+         }
+      }
    }
    return EXIT_SUCCESS;
 }
\ No newline at end of file
diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
index 3b350b6c9c48e0418244101cb3de1daec26c34ce..25e9342058f24b7ae6008ee66a42beb8dd230a5a 100644
--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py
@@ -24,7 +24,7 @@ const bool infoCsePdfs = {cse_pdfs};
 with CodeGeneration() as ctx:
     field_type = "float64" if ctx.double_accuracy else "float32"
 
-    streaming_pattern = 'pull'
+    streaming_pattern = 'aa'
     timesteps = get_timesteps(streaming_pattern)
     stencil = LBStencil(Stencil.D3Q19)
 
@@ -33,7 +33,7 @@ with CodeGeneration() as ctx:
     density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
     macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
 
-    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
+    lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
                            streaming_pattern=streaming_pattern)
     lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")
 
diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7eab3046a8969f7ccc09bc7941315d924412822
--- /dev/null
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp
@@ -0,0 +1,98 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file NonUniformGridGenerator.cpp
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#include "blockforest/Initialization.h"
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/loadbalancing/StaticCurve.h"
+
+#include "core/all.h"
+
+#include "python_coupling/CreateConfig.h"
+
+#include <string>
+
+#include "LdcSetup.h"
+
+using namespace walberla;
+
+
+int main(int argc, char ** argv){
+   const mpi::Environment env(argc, argv);
+   mpi::MPIManager::instance()->useWorldComm();
+
+   if(mpi::MPIManager::instance()->numProcesses() > 1){
+      WALBERLA_ABORT("Commandment: Thou shalt not run thy grid generator with more than one process.");
+   }
+
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
+   {
+      auto config = *cfg;
+      auto domainSetup = config->getOneBlock("DomainSetup");
+
+      Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+      Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+      Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+      auto blockForestSetup = config->getOneBlock("SetupBlockForest");
+      const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
+      const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
+      const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
+      const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
+      const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
+
+      const LDC ldc(refinementDepth);
+      SetupBlockForest setupBfs;
+
+      auto refSelection = ldc.refinementSelector();
+      setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+      const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+      setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+      setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+      setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+
+      {
+         std::ostringstream oss;
+         oss << blockForestFilestem << ".bfs";
+         setupBfs.saveToFile(oss.str().c_str());
+      }
+
+      if(writeVtk){
+         setupBfs.writeVTKOutput(blockForestFilestem);
+      }
+
+      if(outputStatistics){
+         WALBERLA_LOG_INFO_ON_ROOT("===========================  BLOCK FOREST STATISTICS ============================");
+         WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
+         for (uint_t level = 0; level <= refinementDepth; level++)
+         {
+            const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
+            WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
+         }
+
+         const uint_t avgBlocksPerProc = setupBfs.getNumberOfBlocks() / setupBfs.getNumberOfProcesses();
+         WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
+         WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
+      }
+
+
+      WALBERLA_LOG_INFO_ON_ROOT("Ending program")
+   }
+}
diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..8c99c62c2fc7c3791239b417b813fa6a1d62f1cd 100644
--- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
@@ -1,16 +1,37 @@
 import waLBerla as wlb
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+import sqlite3
+import os
+import sys
 
+DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
 
 class Scenario:
-    def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2),
-                 cells_per_block=(32, 32, 32), refinement_depth=0):
+    def __init__(self,
+                 domain_size=(64, 64, 64),
+                 root_blocks=(2, 2, 2),
+                 num_processes=1,
+                 refinement_depth=0,
+                 cells_per_block=(32, 32, 32),
+                 timesteps=101,
+                 vtk_write_frequency=0,
+                 logger_frequency=0,
+                 blockforest_filestem="blockforest",
+                 write_setup_vtk=False):
 
         self.domain_size = domain_size
         self.root_blocks = root_blocks
         self.cells_per_block = cells_per_block
+        self.periodic = (0, 0, 0)
+
         self.refinement_depth = refinement_depth
+        self.num_processes = num_processes
+        self.bfs_filestem = blockforest_filestem
+        self.write_setup_vtk = write_setup_vtk
 
-        self.periodic = (0, 0, 0)
+        self.timesteps = timesteps
+        self.vtk_write_frequency = vtk_write_frequency
+        self.logger_frequency = logger_frequency
 
         self.config_dict = self.config(print_dict=False)
 
@@ -24,30 +45,56 @@ class Scenario:
                 'cellsPerBlock': self.cells_per_block,
                 'periodic': self.periodic
             },
+            'SetupBlockForest': {
+                'refinementDepth': self.refinement_depth,
+                'numProcesses': self.num_processes,
+                'blockForestFilestem': self.bfs_filestem,
+                'writeVtk': self.write_setup_vtk,
+                'outputStatistics': False
+            },
             'Parameters': {
                 'omega': 1.95,
-                'timesteps': 10001,
-
-                'refinementDepth': self.refinement_depth,
-                'writeSetupForestAndReturn': False,
-                'numProcesses': 1,
-
-                'cudaEnabledMPI': False,
-                'benchmarkKernelOnly': False,
-
-                'remainingTimeLoggerFrequency': 3,
-
-                'vtkWriteFrequency': 5000,
+                'timesteps': self.timesteps,
+                'remainingTimeLoggerFrequency': self.logger_frequency,
+                'vtkWriteFrequency': self.vtk_write_frequency,
             },
             'Logging': {
                 'logLevel': "info",
             }
         }
 
-        if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False:
+        if(print_dict):
             wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+
         return config_dict
 
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        table_name = f"runs"
+        table_name = table_name.replace("-", "_")
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, table_name, DB_FILE)
+                storeSingle(result, table_name, DB_FILE)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+
 
 def validation_run():
     """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
@@ -61,9 +108,31 @@ def validation_run():
     scenarios = wlb.ScenarioManager()
     scenario = Scenario(domain_size=domain_size,
                         root_blocks=root_blocks,
+                        num_processes=1,
+                        refinement_depth=1,
                         cells_per_block=cells_per_block,
-                        refinement_depth=1)
+                        timesteps=201,
+                        vtk_write_frequency=100,
+                        logger_frequency=5,
+                        write_setup_vtk=True)
     scenarios.add(scenario)
 
+def scaling():
+    wlb.log_info_on_root("Running scaling benchmark...")
+
+    numProc = wlb.mpi.numProcesses()
+
+    domain_size = (256, 256, 128 * numProc)
+    cells_per_block = (64, 64, 64)
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
+
+    scenarios = wlb.ScenarioManager()
+    scenario = Scenario(domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=2,
+                        timesteps=10)
+    scenarios.add(scenario)
 
 validation_run()
+# scaling()
diff --git a/apps/benchmarks/UniformGridCPU/CMakeLists.txt b/apps/benchmarks/UniformGridCPU/CMakeLists.txt
index 0d159bc542c6ada48999dace8e2b7dce4a085519..8e9c1e7af92fcde2102b9e279abed9e451c28858 100644
--- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt
@@ -5,7 +5,7 @@ waLBerla_link_files_to_builddir( "simulation_setup" )
 
 foreach(streaming_pattern pull push aa esotwist)
     foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky)
+        foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
 	    # KBC methods only for D2Q9 and D3Q27 defined
 	    if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
 		    continue()
diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
index 64d94ce3d0dd843b29e693d446485bac73b84119..fc229eb116f86ee6c967a8fba5a1948f7b73fc64 100644
--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
@@ -68,6 +68,9 @@ int main(int argc, char** argv)
 {
    const mpi::Environment env(argc, argv);
 
+   std::string input_filename(argv[1]);
+   bool inputIsPython = string_ends_with(input_filename, ".py");
+
    for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
       WALBERLA_MPI_WORLD_BARRIER()
@@ -157,7 +160,7 @@ int main(int argc, char** argv)
       } else if (timeStepStrategy == "kernelOnly") {
          timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
       } else if (timeStepStrategy == "StreamOnly") {
-         timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only");
+         timeLoop.add() << Sweep(sweepCollection.stream(SweepCollection_T::ALL), "LBM Stream-Only");
       } else {
          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'")
       }
@@ -225,17 +228,24 @@ int main(int argc, char** argv)
 
          WALBERLA_ROOT_SECTION()
          {
-            python_coupling::PythonCallback pythonCallbackResults("results_callback");
-            if (pythonCallbackResults.isCallable())
-            {
-               pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
-               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
-               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
-               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
-               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
-               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
-               // Call Python function to report results
-               pythonCallbackResults();
+            if(inputIsPython){
+               python_coupling::PythonCallback pythonCallbackResults("results_callback");
+               if (pythonCallbackResults.isCallable())
+               {
+                  pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
+                  pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
+                  pythonCallbackResults.data().exposeValue("numCores", performance.cores());
+                  pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
+                  pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
+                  pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
+                  pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+                  pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+                  pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+                  pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+                  pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+                  // Call Python function to report results
+                  pythonCallbackResults();
+               }
             }
          }
       }
diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
index cd1a36114788a0ad440f89d750abc8af26109eda..ae9ec4f1bd6e26a8474099cd6f03d4c40f114854 100644
--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py
@@ -7,7 +7,7 @@ from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases
     insert_symbol_times_minus_one
 
 from lbmpy.advanced_streaming import is_inplace
-from lbmpy.advanced_streaming.utility import streaming_patterns
+from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep
 from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
 from lbmpy.enums import Method, Stencil
@@ -26,40 +26,40 @@ options_dict = {
     'srt': {
         'method': Method.SRT,
         'relaxation_rate': omega,
-        'compressible': False,
+        'compressible': True,
     },
     'trt': {
         'method': Method.TRT,
         'relaxation_rate': omega,
-        'compressible': False,
+        'compressible': True,
     },
-    'mrt': {
+    'r-w-mrt': {
         'method': Method.MRT,
         'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
-        'compressible': False,
+        'compressible': True,
     },
-    'mrt-overrelax': {
+    'w-mrt': {
         'method': Method.MRT,
         'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
-        'compressible': False,
+        'compressible': True,
     },
-    'central': {
+    'r-cm': {
         'method': Method.CENTRAL_MOMENT,
         'relaxation_rate': omega,
         'compressible': True,
     },
-    'central-overrelax': {
+    'cm': {
         'method': Method.CENTRAL_MOMENT,
         'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
         'compressible': True,
     },
-    'cumulant': {
-        'method': Method.MONOMIAL_CUMULANT,
+    'r-k': {
+        'method': Method.CUMULANT,
         'relaxation_rate': omega,
         'compressible': True,
     },
-    'cumulant-overrelax': {
-        'method': Method.MONOMIAL_CUMULANT,
+    'k': {
+        'method': Method.CUMULANT,
         'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
         'compressible': True,
     },
@@ -87,9 +87,6 @@ const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
 
-# DEFAULTS
-optimize = True
-
 with CodeGeneration() as ctx:
     openmp = True if ctx.openmp else False
     field_type = "float64" if ctx.double_accuracy else "float32"
@@ -105,9 +102,6 @@ with CodeGeneration() as ctx:
     streaming_pattern = config_tokens[1]
     collision_setup = config_tokens[2]
 
-    if len(config_tokens) >= 4:
-        optimize = (config_tokens[3] != 'noopt')
-
     if stencil_str == "d3q27":
         stencil = LBStencil(Stencil.D3Q27)
     elif stencil_str == "d3q19":
@@ -139,20 +133,15 @@ with CodeGeneration() as ctx:
     # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
     # is_inplace is set to False to ensure that the streaming is done with src and dst field.
     # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
-    accessor = CollideOnlyInplaceAccessor()
-    accessor.is_inplace = False
-    field_swaps_stream_only = [(pdfs, pdfs_tmp)]
-    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
+    # accessor = CollideOnlyInplaceAccessor()
+    accessor = get_accessor(streaming_pattern, Timestep.EVEN)
+    #accessor.is_inplace = False
+    field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)]
+    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor)
 
     # LB Sweep
     collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
 
-    if optimize:
-        collision_rule = insert_constants(collision_rule)
-        collision_rule = insert_zeros(collision_rule)
-        collision_rule = insert_aliases(collision_rule)
-        collision_rule = insert_symbol_times_minus_one(collision_rule)
-
     no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
                                      boundary_object=NoSlip())
     ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
@@ -166,7 +155,8 @@ with CodeGeneration() as ctx:
                          cpu_openmp=openmp, cpu_vectorize_info=cpu_vec)
 
     # Stream only kernel
-    generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
+    generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
+                   field_swaps=field_swaps_stream_only,
                    target=ps.Target.CPU, cpu_openmp=openmp)
 
     infoHeaderParams = {
diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
old mode 100755
new mode 100644
index 6a54df4522354c3f2f2ca51da370e7b4eba5229d..c3076cb3e1f457ecae1f6a5090adac2314c34e97
--- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
@@ -24,15 +24,15 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK
 ldc_setup = {'Border': [
     {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'},
     {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
 ]}
 
 
 class Scenario:
-    def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1),
+    def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
                  timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
                  warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
                  vtk_write_frequency=0, remaining_time_logger_frequency=-1):
@@ -41,7 +41,8 @@ class Scenario:
             init_shear_flow = False
             periodic = (0, 0, 0)
 
-        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+        self.blocks_per_process = blocks_per_process
+        self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses())
 
         self.cells_per_block = cells_per_block
         self.periodic = periodic
@@ -68,7 +69,7 @@ class Scenario:
                 'blocks': self.blocks,
                 'cellsPerBlock': self.cells_per_block,
                 'periodic': self.periodic,
-                'oneBlockPerProcess': True
+                'cartesianSetup': (self.blocks_per_process == 1)
             },
             'Parameters': {
                 'omega': self.omega,
@@ -106,7 +107,7 @@ class Scenario:
         sequenceValuesToScalars(result)
         num_tries = 4
         # check multiple times e.g. may fail when multiple benchmark processes are running
-        table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}"
+        table_name = f"runs"
         table_name = table_name.replace("-", "_")
         for num_try in range(num_tries):
             try:
diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
index 29f7de7657e6d923b216d1ed4eae0229325a3762..215f5a5c1f5dae1c7b514026bd19def5bff8786f 100644
--- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
+++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h
@@ -186,7 +186,7 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T
    for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){
       uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it);
       // Propagate on ghost layers shadowing coarse or no blocks
-      if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){
+      if(block->neighborhoodSectionHasLargerBlock(nSecIdx)){
          CellInterval ci;
          pdfField->getGhostRegion(*it, ci, 1);
          sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci);