diff --git a/.gitignore b/.gitignore
index c755c9f0b48f05521eb5de8638ae4b3229f16044..4c63e3b65e5c1acb0f32cbc2d7f9975427a68c57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@ qrc_*
 # macOS
 **/.DS_Store
 
+# CLion indexing
+*.uuid
+
 
 # Generated files
 *.out
diff --git a/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp b/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp
index 817097ccab6f4a08d1eb0bd80b169d1dca326111..4a674d48b66838c4de39e4316c551b7ef6d2de89 100644
--- a/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp
+++ b/apps/benchmarks/NonUniformGrid/NonUniformGrid.cpp
@@ -243,23 +243,23 @@ static void refinementSelection( SetupBlockForest& forest )
    AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMax() - zSize,
                      domain.xMax(), domain.yMax(), domain.zMax() );
 
-   for( auto block = forest.begin(); block != forest.end(); ++block )
+   for(auto & block : forest)
    {
-      auto & aabb = block->getAABB();
+      auto & aabb = block.getAABB();
       if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
       {
-         if( block->getLevel() < ( BlockForestLevels - uint_t(1) ) )
-            block->setMarker( true );
+         if( block.getLevel() < ( BlockForestLevels - uint_t(1) ) )
+            block.setMarker( true );
       }
    }
 }
 
 static void workloadAndMemoryAssignment( SetupBlockForest & forest, const memory_t memoryPerBlock ) {
 
-   for( auto block = forest.begin(); block != forest.end(); ++block )
+   for(auto & block : forest)
    {
-      block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-      block->setMemory( memoryPerBlock );
+      block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+      block.setMemory( memoryPerBlock );
    }
 }
 
@@ -295,7 +295,7 @@ void createSetupBlockForest( blockforest::SetupBlockForest & sforest, const Conf
 
 #ifndef NDEBUG
    WALBERLA_ASSERT_EQUAL( sforest.getNumberOfBlocks(), numberOfBlocks( workerProcesses, fineBlocksPerProcess ) );
-   for( uint_t i = uint_t(0); i != BlockForestLevels; ++i )
+   for( auto i = uint_t(0); i != BlockForestLevels; ++i )
    {
       std::vector< blockforest::SetupBlock * > blocks;
       sforest.getBlocks( blocks, i );
@@ -415,13 +415,13 @@ void ReGrid::operator()( std::vector< std::pair< const Block *, uint_t > > & min
                   domain.xMax(), domain.yMax(), domain.zMin() + real_c(0.99) * zSize );
    }
    
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      auto & aabb = it->first->getAABB();
+      auto & aabb = minTargetLevel.first->getAABB();
       if( left.intersects( aabb ) || right.intersects( aabb ) )
-         it->second = BlockForestLevels - uint_t(1);
+         minTargetLevel.second = BlockForestLevels - uint_t(1);
       else
-         it->second = uint_t(0);
+         minTargetLevel.second = uint_t(0);
    }
 }
 
diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
index 3a44867523a9b94f8f2f9b57bc8b5aeb6aac6819..4d3ce332f4f6b23e7fbfb5e885b61c3268232155 100644
--- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
+++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp
@@ -66,22 +66,32 @@ using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction
 
 class LDCRefinement
 {
- private:
+private:
    const uint_t refinementDepth_;
 
- public:
-   LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}
 
-   void operator()(SetupBlockForest& forest)
+   void operator()(SetupBlockForest& forest) const
    {
-      std::vector< SetupBlock* > blocks;
-      forest.getBlocks(blocks);
+      const AABB & domain = forest.getDomain();
 
-      for (auto block : blocks)
+      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+
+      AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                       domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
+
+      AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                        domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+
+      for(auto & block : forest)
       {
-         if (forest.atDomainYMaxBorder(*block))
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
          {
-            if (block->getLevel() < refinementDepth_) { block->setMarker(true); }
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
          }
       }
    }
@@ -89,17 +99,26 @@ class LDCRefinement
 
 class LDC
 {
- public:
-   LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}
 
-   Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); }
-   RefinementSelectionFunctor refinementSelector() { return LDCRefinement(refinementDepth_); }
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
 
    void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
    {
       for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
       {
-         Block& b           = dynamic_cast< Block& >(*bIt);
+         auto& b           = dynamic_cast< Block& >(*bIt);
          const uint_t level       = b.getLevel();
          auto flagField     = b.getData< FlagField_T >(flagFieldID);
          const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
@@ -118,26 +137,20 @@ class LDC
          }
       }
    }
- private:
-   const std::string refinementProfile_;
-   const uint_t refinementDepth_;
-
-   const FlagUID noSlipFlagUID_;
-   const FlagUID ubbFlagUID_;
 };
 
 static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses()))
 {
-   Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize");
-   Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks");
-   Vector3< bool > periodic     = domainSetup.getParameter< Vector3< bool > >("periodic");
+   Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+   Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+   Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
 
    auto refSelection = ldcSetup.refinementSelector();
-   setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection));
+   setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
    const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
-   setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
+   setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
    setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
-   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalance(true), numProcesses);
+   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
 }
 
 int main(int argc, char** argv)
@@ -147,7 +160,7 @@ int main(int argc, char** argv)
 
    for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
-      WALBERLA_MPI_WORLD_BARRIER()
+      WALBERLA_MPI_BARRIER()
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       ///                                        SETUP AND CONFIGURATION                                             ///
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -197,8 +210,6 @@ int main(int argc, char** argv)
          std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
       blocks->createCellBoundingBoxes();
 
-      WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); }
-
       WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
       for (uint_t level = 0; level <= refinementDepth; level++)
       {
@@ -223,6 +234,8 @@ int main(int argc, char** argv)
       {
          sweepCollection.initialise(&block, 2);
       }
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -294,9 +307,13 @@ int main(int argc, char** argv)
       WcTimingPool timeloopTiming;
       WcTimer simTimer;
 
+      WALBERLA_MPI_BARRIER()
       WALBERLA_LOG_INFO_ON_ROOT("Starting benchmark with " << timesteps << " time steps")
+      WALBERLA_MPI_BARRIER()
+
       simTimer.start();
       timeLoop.run(timeloopTiming);
+      WALBERLA_MPI_BARRIER()
       simTimer.end();
 
       WALBERLA_LOG_INFO_ON_ROOT("Benchmark finished")
diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
index 1de18d9f0f6ed8ef684eddee74f4712c8f72c852..ccfcecacfb5c0f5a79b56aea13409cc3da4d2748 100644
--- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py
@@ -2,12 +2,13 @@ import waLBerla as wlb
 
 
 class Scenario:
-    def __init__(self, domain_size=(32, 32, 32), root_blocks=(2, 2, 2),
-                 cells_per_block=(16, 16, 16)):
+    def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2),
+                 cells_per_block=(32, 32, 32), refinement_depth=0):
 
         self.domain_size = domain_size
         self.root_blocks = root_blocks
         self.cells_per_block = cells_per_block
+        self.refinement_depth = refinement_depth
 
         self.periodic = (0, 0, 0)
 
@@ -25,21 +26,25 @@ class Scenario:
             },
             'Parameters': {
                 'omega': 1.95,
-                'timesteps': 101,
+                'timesteps': 10001,
 
-                'refinementDepth': 1,
+                'refinementDepth': self.refinement_depth,
                 'writeSetupForestAndReturn': False,
                 'numProcesses': 1,
 
+                'cudaEnabledMPI': False,
                 'benchmarkKernelOnly': False,
 
                 'remainingTimeLoggerFrequency': 3,
 
-                'vtkWriteFrequency': 50,
+                'vtkWriteFrequency': 5000,
+            },
+            'Logging': {
+                'logLevel': "info",
             }
         }
 
-        if print_dict:
+        if print_dict and not config_dict["Parameters"]["writeSetupForestAndReturn"]:
             wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
         return config_dict
 
@@ -47,10 +52,17 @@ class Scenario:
 def validation_run():
     """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
     wlb.log_info_on_root("Validation run")
-    wlb.log_info_on_root("")
+
+    domain_size = (96, 96, 96)
+    cells_per_block = (32, 32, 32)
+
+    root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
 
     scenarios = wlb.ScenarioManager()
-    scenario = Scenario()
+    scenario = Scenario(domain_size=domain_size,
+                        root_blocks=root_blocks,
+                        cells_per_block=cells_per_block,
+                        refinement_depth=1)
     scenarios.add(scenario)
 
 
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index fa3905b4236295275d82e2e4aad91be4ddcbb5ba..8110dbbb4437b8d3c56d9b855de501fd55521454 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -50,11 +50,8 @@
 #include "lbm_generated/gpu/BasicRecursiveTimeStepGPU.h"
 
 #include "python_coupling/CreateConfig.h"
-#include "python_coupling/DictWrapper.h"
 #include "python_coupling/PythonCallback.h"
 
-#include "timeloop/SweepTimeloop.h"
-
 #include <cmath>
 
 #include "NonUniformGridGPUInfoHeader.h"
@@ -80,18 +77,28 @@ class LDCRefinement
    const uint_t refinementDepth_;
 
  public:
-   LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}
 
-   void operator()(SetupBlockForest& forest)
+   void operator()(SetupBlockForest& forest) const
    {
-      std::vector< SetupBlock* > blocks;
-      forest.getBlocks(blocks);
+      const AABB & domain = forest.getDomain();
+
+      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+
+      AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                       domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
 
-      for (auto block : blocks)
+      AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                        domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+
+      for(auto & block : forest)
       {
-         if (forest.atDomainYMaxBorder(*block))
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
          {
-            if (block->getLevel() < refinementDepth_) { block->setMarker(true); }
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
          }
       }
    }
@@ -107,10 +114,9 @@ class LDC
    const FlagUID ubbFlagUID_;
 
  public:
-   LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}
 
-   Vector3< real_t > acceleration() const { return Vector3< real_t >(0.0); }
-   RefinementSelectionFunctor refinementSelector()
+   RefinementSelectionFunctor refinementSelector() const
    {
       return LDCRefinement(refinementDepth_);
    }
@@ -119,7 +125,7 @@ class LDC
    {
       for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
       {
-         Block& b           = dynamic_cast< Block& >(*bIt);
+         auto& b           = dynamic_cast< Block& >(*bIt);
          const uint_t level       = b.getLevel();
          auto flagField     = b.getData< FlagField_T >(flagFieldID);
          const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
@@ -140,18 +146,19 @@ class LDC
    }
 };
 
-static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses()))
-{
-   Vector3< real_t > domainSize = domainSetup.getParameter< Vector3< real_t > >("domainSize");
-   Vector3< uint_t > rootBlocks = domainSetup.getParameter< Vector3< uint_t > >("rootBlocks");
-   Vector3< bool > periodic     = domainSetup.getParameter< Vector3< bool > >("periodic");
-
-   auto refSelection = ldcSetup.refinementSelector();
-   setupBfs.addRefinementSelectionFunction(std::function< void(SetupBlockForest&) >(refSelection));
-   const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
-   setupBfs.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
-   setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
-   setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+namespace {
+void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) {
+    Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
+    Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
+    Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
+
+    auto refSelection = ldcSetup.refinementSelector();
+    setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
+    const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
+    setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
+    setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
+    setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
+}
 }
 
 int main(int argc, char** argv)
@@ -203,7 +210,7 @@ int main(int argc, char** argv)
             totalCellUpdates += timesteps * math::uintPow2(level)  * numberOfCells;
             WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
          }
-         cudaDeviceProp prop;
+         cudaDeviceProp prop{};
          WALBERLA_GPU_CHECK(gpuGetDeviceProperties(&prop, 0))
 
          const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
@@ -232,8 +239,6 @@ int main(int argc, char** argv)
       auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
       blocks->createCellBoundingBoxes();
 
-      WALBERLA_ROOT_SECTION() { vtk::writeDomainDecomposition(blocks, "domainDecomposition", "vtk_out"); }
-
       WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << blocks->getNumberOfBlocks())
       for (uint_t level = 0; level <= refinementDepth; level++)
       {
@@ -263,6 +268,10 @@ int main(int argc, char** argv)
       {
          sweepCollection.initialise(&iBlock, 2, nullptr);
       }
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
+      WALBERLA_LOG_INFO_ON_ROOT("Initialisation done")
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -280,6 +289,7 @@ int main(int argc, char** argv)
       auto communication = std::make_shared< NonUniformGPUScheme <CommunicationStencil_T>> (blocks, cudaEnabledMPI);
       auto packInfo = lbm_generated::setupNonuniformGPUPdfCommunication<GPUPdfField_T>(blocks, pdfFieldGpuID);
       communication->addPackInfo(packInfo);
+      WALBERLA_MPI_BARRIER()
 
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       ///                                          TIME STEP DEFINITIONS                                             ///
@@ -343,10 +353,14 @@ int main(int argc, char** argv)
 
       WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
       WALBERLA_GPU_CHECK(gpuPeekAtLastError())
+      WALBERLA_MPI_BARRIER()
       WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+      WALBERLA_MPI_BARRIER()
+
       simTimer.start();
       timeLoop.run(timeloopTiming);
       WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+      WALBERLA_MPI_BARRIER()
       simTimer.end();
 
       WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index d05852fd1934c71ea67d6cce3a8ae3f4cc80e61a..ccfcecacfb5c0f5a79b56aea13409cc3da4d2748 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -26,7 +26,7 @@ class Scenario:
             },
             'Parameters': {
                 'omega': 1.95,
-                'timesteps': 1501,
+                'timesteps': 10001,
 
                 'refinementDepth': self.refinement_depth,
                 'writeSetupForestAndReturn': False,
@@ -37,7 +37,10 @@ class Scenario:
 
                 'remainingTimeLoggerFrequency': 3,
 
-                'vtkWriteFrequency': 500,
+                'vtkWriteFrequency': 5000,
+            },
+            'Logging': {
+                'logLevel': "info",
             }
         }
 
@@ -50,7 +53,7 @@ def validation_run():
     """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
     wlb.log_info_on_root("Validation run")
 
-    domain_size = (64, 64, 64)
+    domain_size = (96, 96, 96)
     cells_per_block = (32, 32, 32)
 
     root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py
index ac475f72c9489d9e7b74ce25d9bf303413ae7834..59fd6028fafcd2bab784c1a595f8511464108def 100644
--- a/python/pystencils_walberla/codegen.py
+++ b/python/pystencils_walberla/codegen.py
@@ -428,6 +428,7 @@ class KernelInfo:
     def generate_kernel_invocation_code(self, **kwargs):
         ast = self.ast
         ast_params = self.parameters
+        fnc_name = ast.function_name
         is_cpu = self.ast.target == Target.CPU
         call_parameters = ", ".join([p.symbol.name for p in ast_params])
 
@@ -446,20 +447,18 @@ class KernelInfo:
 
             indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
             sp_printer_c = CudaSympyPrinter()
-
             block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block'])
             grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid'])
 
-            kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
-
             kernel_call_lines = [
-                f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));",
-                f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));",
-                kernel_launch]
+                f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));",
+                f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));",
+                f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
+            ]
 
             return "\n".join(kernel_call_lines)
         else:
-            return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});"
+            return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
 
 
 def get_vectorize_instruction_set(generation_context):
diff --git a/python/pystencils_walberla/kernel_info.py b/python/pystencils_walberla/kernel_info.py
index 1382d94f4220495da28bf02113636fdf8addbaf1..019843f903407717a222c5a44cc945f59a88fcfa 100644
--- a/python/pystencils_walberla/kernel_info.py
+++ b/python/pystencils_walberla/kernel_info.py
@@ -35,6 +35,7 @@ class KernelInfo:
     def generate_kernel_invocation_code(self, **kwargs):
         ast = self.ast
         ast_params = self.parameters
+        fnc_name = ast.function_name
         is_cpu = self.ast.target == Target.CPU
         call_parameters = ", ".join([p.symbol.name for p in ast_params])
 
@@ -53,15 +54,15 @@ class KernelInfo:
 
             indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
             sp_printer_c = CudaSympyPrinter()
+            block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block'])
+            grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid'])
+
             kernel_call_lines = [
-                "dim3 _block(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
-                                                                  for e in indexing_dict['block']),
-                "dim3 _grid(int(%s), int(%s), int(%s));" % tuple(sp_printer_c.doprint(e)
-                                                                 for e in indexing_dict['grid']),
-                "internal_%s::%s<<<_grid, _block, 0, %s>>>(%s);" % (ast.function_name, ast.function_name,
-                                                                    stream, call_parameters),
+                f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));",
+                f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));",
+                f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
             ]
 
             return "\n".join(kernel_call_lines)
         else:
-            return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});"
+            return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
diff --git a/python/pystencils_walberla/kernel_selection.py b/python/pystencils_walberla/kernel_selection.py
index c946f85105185159e317ea18d4740667cd7761c7..544b86de027af9f810df34f9bf10a21db7444fd6 100644
--- a/python/pystencils_walberla/kernel_selection.py
+++ b/python/pystencils_walberla/kernel_selection.py
@@ -172,6 +172,7 @@ class KernelCallNode(AbstractKernelSelectionNode):
     def get_code(self, **kwargs):
         ast = self.ast
         ast_params = self.parameters
+        fnc_name = ast.function_name
         is_cpu = self.ast.target == Target.CPU
         call_parameters = ", ".join([p.symbol.name for p in ast_params])
 
@@ -190,20 +191,18 @@ class KernelCallNode(AbstractKernelSelectionNode):
 
             indexing_dict = ast.indexing.call_parameters(spatial_shape_symbols)
             sp_printer_c = CudaSympyPrinter()
-
             block = tuple(sp_printer_c.doprint(e) for e in indexing_dict['block'])
             grid = tuple(sp_printer_c.doprint(e) for e in indexing_dict['grid'])
 
-            kernel_launch = f"internal_{ast.function_name}::{ast.function_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
-
             kernel_call_lines = [
-                f"dim3 _block(uint32_t({block[0]}), uint32_t({block[1]}), uint32_t({block[2]}));",
-                f"dim3 _grid(uint32_t({grid[0]}), uint32_t({grid[1]}), uint32_t({grid[2]}));",
-                kernel_launch]
+                f"dim3 _block(uint64_c({block[0]}), uint64_c({block[1]}), uint64_c({block[2]}));",
+                f"dim3 _grid(uint64_c({grid[0]}), uint64_c({grid[1]}), uint64_c({grid[2]}));",
+                f"internal_{fnc_name}::{fnc_name}<<<_grid, _block, 0, {stream}>>>({call_parameters});"
+            ]
 
             return "\n".join(kernel_call_lines)
         else:
-            return f"internal_{ast.function_name}::{ast.function_name}({call_parameters});"
+            return f"internal_{fnc_name}::{fnc_name}({call_parameters});"
 
 
 class SimpleBooleanCondition(AbstractConditionNode):
diff --git a/src/blockforest/BlockForest.h b/src/blockforest/BlockForest.h
index 6108ea3e2ce523ca6f8e11549e354f97324524b9..58df1ba4cfa8d99f7a5c1b437c67dfec61109d4e 100644
--- a/src/blockforest/BlockForest.h
+++ b/src/blockforest/BlockForest.h
@@ -239,7 +239,7 @@ public:
    bool storesUniformBlockGrid() const { return depth_ == 0; }
 
    uint_t getTreeIdDigits()   const { return treeIdDigits_; }
-   uint_t getBlockIdBytes()   const { uint_t bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); }
+   uint_t getBlockIdBytes()   const { uint_t const bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); }
    
    uint_t getDepth()          const { return depth_; }
    uint_t getNumberOfLevels() const { return depth_ + uint_t(1); }
@@ -635,7 +635,7 @@ inline void BlockForest::getBlocksOverlappedByAABB( std::vector< IBlock* >& bloc
 
 inline const Block* BlockForest::getBlock( const IBlockID& id ) const {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id )
 
    auto it = blocks_.find( *static_cast< const BlockID* >( &id ) );
 
@@ -649,7 +649,7 @@ inline const Block* BlockForest::getBlock( const IBlockID& id ) const {
 
 inline Block* BlockForest::getBlock( const IBlockID& id ) {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id )
 
    auto it = blocks_.find( *static_cast< const BlockID* >( &id ) );
 
@@ -708,7 +708,7 @@ inline void BlockForest::getAllBlocks( std::vector< shared_ptr< IBlockID > >& bl
 
    for( auto block = begin(); block != end(); ++block ) {
       const IBlockID& id = block->getId();
-      WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id );
+      WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id )
       blocks.push_back( walberla::make_shared< BlockID >( *static_cast< const BlockID* >( &id ) ) );
    }
 }
@@ -744,7 +744,7 @@ inline bool BlockForest::blockExistsRemotely( const real_t x, const real_t y, co
 
 inline bool BlockForest::blockExists( const IBlockID& id ) const {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id )
 
    if( blockInformation_->active() )
       return blockInformation_->exists( *static_cast< const BlockID* >( &id ) );
@@ -763,7 +763,7 @@ inline bool BlockForest::blockExistsLocally( const IBlockID& id ) const {
 
 inline bool BlockForest::blockExistsRemotely( const IBlockID& id ) const {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const BlockID* >( &id ), &id )
 
    if( blockInformation_->active() )
       return blockInformation_->existsRemotely( *static_cast< const BlockID* >( &id ) );
@@ -801,7 +801,7 @@ inline bool BlockForest::rootBlockExistsRemotely( const uint_t x, const uint_t y
 
 inline uint_t BlockForest::getLevel( const IBlock& block ) const {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block )
 
    return static_cast< const Block* >( &block )->getLevel();
 }
@@ -810,8 +810,8 @@ inline uint_t BlockForest::getLevel( const IBlock& block ) const {
 
 inline uint_t BlockForest::getLevelFromBlockId( const BlockID& id ) const {
 
-   WALBERLA_ASSERT_GREATER_EQUAL( id.getUsedBits(), treeIdDigits_ );
-   WALBERLA_ASSERT_EQUAL( ( id.getUsedBits() - treeIdDigits_ ) % 3, 0 );
+   WALBERLA_ASSERT_GREATER_EQUAL( id.getUsedBits(), treeIdDigits_ )
+   WALBERLA_ASSERT_EQUAL( ( id.getUsedBits() - treeIdDigits_ ) % 3, 0 )
 
    return ( id.getUsedBits() - treeIdDigits_ ) / 3;
 }
@@ -820,7 +820,7 @@ inline uint_t BlockForest::getLevelFromBlockId( const BlockID& id ) const {
 
 inline uint_t BlockForest::getAABBFromBlockId( AABB& aabb, const BlockID& id ) const {
 
-   BlockReconstruction::AABBReconstruction aabbReconstruction( domain_, size_[0], size_[1], size_[2], treeIdDigits_ );
+   BlockReconstruction::AABBReconstruction const aabbReconstruction( domain_, size_[0], size_[1], size_[2], treeIdDigits_ );
 
    return aabbReconstruction( aabb, id );
 }
@@ -838,9 +838,9 @@ inline AABB BlockForest::getAABBFromBlockId( const BlockID& id ) const {
 
 inline void BlockForest::getRootBlockID( BlockID& id, const uint_t x, const uint_t y, const uint_t z ) const {
 
-   WALBERLA_ASSERT_LESS( x, size_[0] );
-   WALBERLA_ASSERT_LESS( y, size_[1] );
-   WALBERLA_ASSERT_LESS( z, size_[2] );
+   WALBERLA_ASSERT_LESS( x, size_[0] )
+   WALBERLA_ASSERT_LESS( y, size_[1] )
+   WALBERLA_ASSERT_LESS( z, size_[2] )
 
    id.clear();
    id.init( z * size_[1] * size_[0] + y * size_[0] + x, uint_c(1) << ( treeIdDigits_ - 1 ) );
@@ -1017,7 +1017,7 @@ public:
       for( auto function = functions_.begin(); function != functions_.end(); ++function, ++iter )
       {
          (*function)( *iter, blocksAlreadyMarkedForRefinement, forest );
-         WALBERLA_ASSERT_EQUAL(iter->size(), numberOfBlocks, "Number of blocks has changed during min target level determination!");
+         WALBERLA_ASSERT_EQUAL(iter->size(), numberOfBlocks, "Number of blocks has changed during min target level determination!")
       }
 
       // combine the outcome of the different functions into a single target level
@@ -1026,7 +1026,7 @@ public:
       {
          for( uint_t fct = 0; fct < functions_.size(); ++fct)
          {
-            WALBERLA_ASSERT_EQUAL(minTargetLevelsPerFunction[fct][block].first->getId(), minTargetLevels[block].first->getId());
+            WALBERLA_ASSERT_EQUAL(minTargetLevelsPerFunction[fct][block].first->getId(), minTargetLevels[block].first->getId())
             targetLevels[fct] = minTargetLevelsPerFunction[fct][block].second;
          }
          minTargetLevels[block].second = targetLevelReductionFct_(targetLevels);
diff --git a/src/blockforest/SetupBlockForest.h b/src/blockforest/SetupBlockForest.h
index cc6268bde5190492d072717ae2d37d84358b8a41..1d105148f1d1c6beff3114d70b26099528ede467 100644
--- a/src/blockforest/SetupBlockForest.h
+++ b/src/blockforest/SetupBlockForest.h
@@ -175,7 +175,7 @@ public:
    uint_t getMinLevel()           const;
    uint_t getMaxLevel()           const;
    uint_t getTreeIdDigits()       const { return treeIdDigits_; }
-   uint_t getBlockIdBytes()       const { uint_t bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); }
+   uint_t getBlockIdBytes()       const { uint_t const bits = treeIdDigits_ + 3 * depth_; return (bits >> 3) + (( bits & 7 ) ? uint_c(1) : uint_c(0)); }
 
    uint_t getNumberOfTrees()      const { return forest_.size(); }
    uint_t getNumberOfRootBlocks() const { return numberOfRootBlocks_; }
@@ -312,7 +312,7 @@ public:
    uint_t getNumberOfProcesses      () const { return numberOfProcesses_; }
    uint_t getNumberOfWorkerProcesses() const { return numberOfProcesses_ - numberOfBufferProcesses_; }
    uint_t getNumberOfBufferProcesses() const { return numberOfBufferProcesses_; }
-   uint_t getProcessIdBytes         () const { uint_t bits = uintMSBPosition( numberOfProcesses_ - 1 );
+   uint_t getProcessIdBytes         () const { uint_t const bits = uintMSBPosition( numberOfProcesses_ - 1 );
                                                return ( bits >> 3 ) + ( ( bits & 7 ) ? uint_c(1) : uint_c(0) ); }
 
    inline bool insertBuffersIntoProcessNetwork() const { return insertBuffersIntoProcessNetwork_; }
@@ -505,19 +505,20 @@ inline uint_t SetupBlockForest::mapForestCoordinatesToTreeIndex( const uint_t x,
 
 inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, const uint_t xSize, const uint_t ySize,
                                                                uint_t& x, uint_t& y, uint_t& z )
-{
-              z = treeIndex / ( xSize * ySize );
-   uint_t index = treeIndex % ( xSize * ySize );
-              y = index / xSize;
-              x = index % xSize;
+{
+    uint_t const index = treeIndex % ( xSize * ySize );
+
+    z = treeIndex / ( xSize * ySize );
+    y = index / xSize;
+    x = index % xSize;
 }
 
 
 
-inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, uint_t& x, uint_t& y, uint_t& z ) const {
+inline void SetupBlockForest::mapTreeIndexToForestCoordinates( const uint_t treeIndex, uint_t& x, uint_t& y, uint_t& z ) const
+{
 
    WALBERLA_ASSERT_LESS( treeIndex, size_[0] * size_[1] * size_[2] )
-
    mapTreeIndexToForestCoordinates( treeIndex, size_[0], size_[1], x, y, z );
 }
 
@@ -530,7 +531,8 @@ inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t x, cons
 
 
 
-inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t treeIndex ) const {
+inline void SetupBlockForest::getRootBlockAABB( AABB& aabb, const uint_t treeIndex ) const
+{
 
    uint_t x;
    uint_t y;
@@ -615,10 +617,10 @@ inline void SetupBlockForest::initWorkloadMemorySUID( const Set<SUID>& selector
 
    if( workloadMemorySUIDAssignmentFunctions.empty() ) {
       WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: No callback functions for assigning workload, memory requirements, and SUIDs to blocks found.\n"
-                             "                               Default values (zero/nothing) will be assigned ..." );
+                             "                               Default values (zero/nothing) will be assigned ..." )
    }
    else {
-      WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: Assigning workload, memory requirements, and SUIDs to blocks ..." );
+      WALBERLA_LOG_PROGRESS( "Initializing SetupBlockForest: Assigning workload, memory requirements, and SUIDs to blocks ..." )
    }
 
    for( uint_t i = 0; i != workloadMemorySUIDAssignmentFunctions.size(); ++i )
diff --git a/src/blockforest/StructuredBlockForest.h b/src/blockforest/StructuredBlockForest.h
index 04b61a94fbc7b4b84ab60714db58a78932471a7a..4e1f0cb5f9619f29a63c9f195b4f44bac909985c 100644
--- a/src/blockforest/StructuredBlockForest.h
+++ b/src/blockforest/StructuredBlockForest.h
@@ -67,9 +67,9 @@ public:
    uint_t getNumberOfYCells( const IBlock& /*block*/ ) const override { return blockCells_[1]; }
    uint_t getNumberOfZCells( const IBlock& /*block*/ ) const override { return blockCells_[2]; }
 #else
-   uint_t getNumberOfXCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[0]; }
-   uint_t getNumberOfYCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[1]; }
-   uint_t getNumberOfZCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ); return blockCells_[2]; }
+   uint_t getNumberOfXCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[0]; }
+   uint_t getNumberOfYCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[1]; }
+   uint_t getNumberOfZCells( const IBlock& block ) const override { WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) ) return blockCells_[2]; }
 #endif
 
    using StructuredBlockStorage::getNumberOfCells;
@@ -231,7 +231,7 @@ inline bool StructuredBlockForest::blockExistsRemotely( const Cell& cell, const
 
 inline uint_t StructuredBlockForest::getLevel( const IBlock& block ) const {
 
-   WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block );
+   WALBERLA_ASSERT_EQUAL( dynamic_cast< const Block* >( &block ), &block )
 
    return static_cast< const Block* >( &block )->getLevel();
 }
@@ -244,8 +244,8 @@ inline uint_t StructuredBlockForest::getNumberOfCells( const IBlock& /*block*/,
 inline uint_t StructuredBlockForest::getNumberOfCells( const IBlock& block, const uint_t index ) const {
 #endif
 
-   WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) );
-   WALBERLA_ASSERT_LESS( index, 3 );
+   WALBERLA_ASSERT_EQUAL( &(getBlockStorage()), &(block.getBlockStorage()) )
+   WALBERLA_ASSERT_LESS( index, 3 )
 
    return blockCells_[ index ];
 }
diff --git a/src/blockforest/communication/NonUniformBufferedScheme.h b/src/blockforest/communication/NonUniformBufferedScheme.h
index be27a51ec805285144983d2d3a3618c502596d50..ff1d3ba3b85f97d558378f1125b00816d734307b 100644
--- a/src/blockforest/communication/NonUniformBufferedScheme.h
+++ b/src/blockforest/communication/NonUniformBufferedScheme.h
@@ -43,9 +43,7 @@
 #include <vector>
 
 
-namespace walberla {
-namespace blockforest {
-namespace communication {
+namespace walberla::blockforest::communication {
 
 
 template< typename Stencil >
@@ -66,12 +64,12 @@ public:
    /*! \name Construction & Destruction */
    //@{
    explicit NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf,
-                                      const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97
+                                      int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97
 
    NonUniformBufferedScheme( const weak_ptr<StructuredBlockForest>& bf,
                              const Set<SUID> & requiredBlockSelectors, 
                              const Set<SUID> & incompatibleBlockSelectors,
-                             const int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97
+                             int baseTag = 778 ); // waLBerla = 119+97+76+66+101+114+108+97
 
    ~NonUniformBufferedScheme();
    //@}
@@ -87,15 +85,17 @@ public:
    //** Synchronous Communication **************************************************************************************
    /*! \name Synchronous Communication */
    //@{
-   inline void operator() () { communicateEqualLevel(); communicateCoarseToFine(); communicateFineToCoarse(); }
+   inline void operator() () { communicate(); }
+
+   inline void communicate() {communicateEqualLevel(); communicateCoarseToFine(); communicateFineToCoarse();}
 
    inline void communicateEqualLevel();
    inline void communicateCoarseToFine();
    inline void communicateFineToCoarse();
 
-   inline void communicateEqualLevel  ( const uint_t level );
-   inline void communicateCoarseToFine( const uint_t fineLevel );
-   inline void communicateFineToCoarse( const uint_t fineLevel );
+   inline void communicateEqualLevel  ( uint_t level );
+   inline void communicateCoarseToFine( uint_t fineLevel );
+   inline void communicateFineToCoarse( uint_t fineLevel );
 
    std::function<void()>  communicateEqualLevelFunctor(const uint_t level) {
       return [level, this](){ NonUniformBufferedScheme::communicateEqualLevel(level);};
@@ -110,7 +110,7 @@ public:
    //*******************************************************************************************************************
    
 
-   LocalCommunicationMode localMode() const { return localMode_; }
+   [[nodiscard]] LocalCommunicationMode localMode() const { return localMode_; }
    inline void setLocalMode( const LocalCommunicationMode & mode );
 
 
@@ -125,9 +125,9 @@ public:
    inline void startCommunicateCoarseToFine();
    inline void startCommunicateFineToCoarse();
 
-   inline void startCommunicateEqualLevel  ( const uint_t level );
-   inline void startCommunicateCoarseToFine( const uint_t fineLevel );
-   inline void startCommunicateFineToCoarse( const uint_t fineLevel );
+   inline void startCommunicateEqualLevel  ( uint_t level );
+   inline void startCommunicateCoarseToFine( uint_t fineLevel );
+   inline void startCommunicateFineToCoarse( uint_t fineLevel );
    
    void wait() { waitCommunicateEqualLevel(); waitCommunicateCoarseToFine(); waitCommunicateFineToCoarse(); }
    std::function<void() >  getWaitFunctor() { return std::bind( &NonUniformBufferedScheme::wait, this ); }
@@ -136,9 +136,9 @@ public:
    inline void waitCommunicateCoarseToFine();
    inline void waitCommunicateFineToCoarse();
 
-   inline void waitCommunicateEqualLevel  ( const uint_t level );
-   inline void waitCommunicateCoarseToFine( const uint_t fineLevel );
-   inline void waitCommunicateFineToCoarse( const uint_t fineLevel );
+   inline void waitCommunicateEqualLevel  ( uint_t level );
+   inline void waitCommunicateCoarseToFine( uint_t fineLevel );
+   inline void waitCommunicateFineToCoarse( uint_t fineLevel );
    //@}
    //*******************************************************************************************************************
 
@@ -147,14 +147,14 @@ protected:
    void init();
    void refresh();
 
-   void startCommunicationEqualLevel  ( const uint_t index, std::set< uint_t > & participatingLevels );
-   void startCommunicationCoarseToFine( const uint_t index, const uint_t coarsestLevel, const uint_t finestLevel );
-   void startCommunicationFineToCoarse( const uint_t index, const uint_t coarsestLevel, const uint_t finestLevel );
+   void startCommunicationEqualLevel  ( uint_t index, std::set< uint_t > & participatingLevels );
+   void startCommunicationCoarseToFine( uint_t index, uint_t coarsestLevel, uint_t finestLevel );
+   void startCommunicationFineToCoarse( uint_t index, uint_t coarsestLevel, uint_t finestLevel );
 
    void resetBufferSystem( shared_ptr< mpi::OpenMPBufferSystem > & bufferSystem );
 
-   void start( const INDEX i, const uint_t j );
-   void  wait( const INDEX i, const uint_t j );
+   void start( INDEX i, uint_t j );
+   void  wait( INDEX i, uint_t j );
 
    static void writeHeader( SendBuffer & buffer, const BlockID & sender, const BlockID & receiver, const stencil::Direction & dir );
    static void  readHeader( RecvBuffer & buffer,       BlockID & sender,       BlockID & receiver,       stencil::Direction & dir );
@@ -162,17 +162,17 @@ protected:
    static void send( SendBuffer & buffer, std::vector< SendBufferFunction > & functions );
           void receive( RecvBuffer & buffer );
 
-   void localBufferPacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo,
+   void localBufferPacking( INDEX i, uint_t j, uint_t bufferIndex, const PackInfo & packInfo,
                             const Block * sender, const Block * receiver, const stencil::Direction & dir );
-   void localBufferUnpacking( const INDEX i, const uint_t j, const uint_t bufferIndex, const PackInfo & packInfo,
+   void localBufferUnpacking( INDEX i, uint_t j, uint_t bufferIndex, const PackInfo & packInfo,
                               Block * receiver, const Block * sender, const stencil::Direction & dir );
 
-   bool isAnyCommunicationInProgress() const;
+   [[nodiscard]] bool isAnyCommunicationInProgress() const;
 
 
 
    weak_ptr<StructuredBlockForest> blockForest_;
-   uint_t forestModificationStamp_;
+   uint_t forestModificationStamp_{uint_c(0)};
 
    std::vector< PackInfo > packInfos_;
 
@@ -279,8 +279,8 @@ void NonUniformBufferedScheme<Stencil>::refresh()
    }
    
 #ifndef NDEBUG
-   for( auto p = packInfos_.begin(); p != packInfos_.end(); ++p )
-      (*p)->clearBufferSizeCheckMap();
+   for(auto & packInfo : packInfos_)
+      packInfo->clearBufferSizeCheckMap();
 #endif
    
    forestModificationStamp_ = forest->getBlockForest().getModificationStamp();
@@ -312,8 +312,8 @@ inline void NonUniformBufferedScheme<Stencil>::addPackInfo( const PackInfo & pac
    packInfos_.push_back( packInfo );
 
    for( uint_t i = 0; i != 3; ++i )
-      for( auto level = setupBeforeNextCommunication_[i].begin(); level != setupBeforeNextCommunication_[i].end(); ++level )
-          *level = char(1);
+      for(char & level : setupBeforeNextCommunication_[i])
+          level = char(1);
 }
 
 
@@ -380,8 +380,8 @@ inline void NonUniformBufferedScheme<Stencil>::setLocalMode( const LocalCommunic
       localMode_ = mode;
 
       for( uint_t i = 0; i != 3; ++i )
-         for( auto level = setupBeforeNextCommunication_[i].begin(); level != setupBeforeNextCommunication_[i].end(); ++level )
-             *level = char(1);
+         for(char & level : setupBeforeNextCommunication_[i])
+             level = char(1);
    }
 }
 
@@ -419,7 +419,7 @@ inline void NonUniformBufferedScheme<Stencil>::startCommunicateCoarseToFine()
    if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() )
       refresh();
 
-   const uint_t coarsestLevel = uint_t(0);
+   const auto coarsestLevel = uint_t(0);
    const uint_t   finestLevel = levelIndex - uint_t(1);
 
    startCommunicationCoarseToFine( levelIndex, coarsestLevel, finestLevel );
@@ -440,7 +440,7 @@ inline void NonUniformBufferedScheme<Stencil>::startCommunicateFineToCoarse()
    if( forestModificationStamp_ != forest->getBlockForest().getModificationStamp() )
       refresh();
 
-   const uint_t coarsestLevel = uint_t(0);
+   const auto coarsestLevel = uint_t(0);
    const uint_t   finestLevel = levelIndex - uint_t(1);
 
    startCommunicationFineToCoarse( levelIndex, coarsestLevel, finestLevel );
@@ -633,7 +633,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint
 
       for( auto it = forest->begin(); it != forest->end(); ++it )
       {
-         Block * block = dynamic_cast< Block * >( it.get() );
+         auto * block = dynamic_cast< Block * >( it.get() );
 
          if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
             continue;
@@ -660,7 +660,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint
                auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) );
                WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() )
 
-               for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
+               for(auto & packInfo : packInfos_)
                {
                   if( localMode_ == BUFFER )
                   {
@@ -669,14 +669,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint
                      const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1);
 
                      VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this,
-                                                      EQUAL_LEVEL, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir );
+                                                      EQUAL_LEVEL, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir );
 
                      threadsafeLocalCommunication.push_back( pack );
 
                      VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this,
-                                                        EQUAL_LEVEL, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir  );
+                                                        EQUAL_LEVEL, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir  );
 
-                     if( (*packInfo)->threadsafeReceiving() )
+                     if( packInfo->threadsafeReceiving() )
                         threadsafeLocalCommunicationUnpack.push_back( unpack );
                      else
                         localCommunicationUnpack.push_back( unpack );
@@ -684,8 +684,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint
                   else
                   {
                      VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalEqualLevel,
-                                                                            *packInfo, block, neighbor, *dir );
-                     if( (*packInfo)->threadsafeReceiving() )
+                                                                            packInfo, block, neighbor, *dir );
+                     if( packInfo->threadsafeReceiving() )
                         threadsafeLocalCommunication.push_back( localCommunicationFunction );
                      else
                         localCommunication.push_back( localCommunicationFunction );
@@ -699,18 +699,18 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationEqualLevel( const uint
                if( !packInfos_.empty() )
                   sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) );
 
-               for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-                  sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataEqualLevel, *packInfo, block, *dir, std::placeholders::_1 ) );
+               for(auto & packInfo : packInfos_)
+                  sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataEqualLevel, packInfo, block, *dir, std::placeholders::_1 ) );
             }
          }
       }
 
       resetBufferSystem( bufferSystem );
 
-      for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender )
+      for( auto & sender : sendFunctions)
       {
-         bufferSystem->addSendingFunction  ( int_c(sender->first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) );
-         bufferSystem->addReceivingFunction( int_c(sender->first), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
+         bufferSystem->addSendingFunction  ( int_c(sender.first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) );
+         bufferSystem->addReceivingFunction( int_c(sender.first), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
       }
 
       setupBeforeNextCommunication = char(0);
@@ -758,7 +758,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
       WALBERLA_CHECK_NOT_NULLPTR( forest, "Trying to access communication for a block storage object that doesn't exist anymore" )
       for( auto it = forest->begin(); it != forest->end(); ++it )
       {
-         Block * block = dynamic_cast< Block * >( it.get() );
+         auto * block = dynamic_cast< Block * >( it.get() );
 
          if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
             continue;
@@ -786,7 +786,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
                      auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) );
                      WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() )
 
-                     for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
+                     for(auto & packInfo : packInfos_)
                      {
                         if( localMode_ == BUFFER )
                         {
@@ -795,14 +795,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
                            const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1);
 
                            VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this,
-                                                            COARSE_TO_FINE, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir );
+                                                            COARSE_TO_FINE, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir );
 
                            threadsafeLocalCommunication.push_back( pack );
 
                            VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this,
-                                                              COARSE_TO_FINE, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir  );
+                                                              COARSE_TO_FINE, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir  );
 
-                           if( (*packInfo)->threadsafeReceiving() )
+                           if( packInfo->threadsafeReceiving() )
                               threadsafeLocalCommunicationUnpack.push_back( unpack );
                            else
                               localCommunicationUnpack.push_back( unpack );
@@ -810,8 +810,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
                         else
                         {
                            VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalCoarseToFine,
-                                                                                  *packInfo, block, neighbor, *dir );
-                           if( (*packInfo)->threadsafeReceiving() )
+                                                                                  packInfo, block, neighbor, *dir );
+                           if( packInfo->threadsafeReceiving() )
                               threadsafeLocalCommunication.push_back( localCommunicationFunction );
                            else
                               localCommunication.push_back( localCommunicationFunction );
@@ -821,12 +821,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
                   else
                   {
                      auto nProcess = block->getNeighborProcess( neighborIdx, n );
-
                      if( !packInfos_.empty() )
                         sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) );
 
-                     for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-                        sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataCoarseToFine, *packInfo, block, receiverId, *dir, std::placeholders::_1 ) );
+                     for(auto & packInfo : packInfos_)
+                        sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataCoarseToFine, packInfo, block, receiverId, *dir, std::placeholders::_1 ) );
                   }
                }
             }
@@ -852,11 +851,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationCoarseToFine( const ui
 
       resetBufferSystem( bufferSystem );
 
-      for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender )
-         bufferSystem->addSendingFunction( int_c(sender->first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) );
+      for( const auto & sender : sendFunctions )
+         bufferSystem->addSendingFunction( int_c(sender.first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) );
 
-      for( auto receiver = ranksToReceiveFrom.begin(); receiver != ranksToReceiveFrom.end(); ++receiver )
-         bufferSystem->addReceivingFunction( int_c(*receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
+      for(auto receiver : ranksToReceiveFrom)
+         bufferSystem->addReceivingFunction( int_c(receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
 
       setupBeforeNextCommunication = char(0);
    }
@@ -904,7 +903,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
 
       for( auto it = forest->begin(); it != forest->end(); ++it )
       {
-         Block * block = dynamic_cast< Block * >( it.get() );
+         auto * block = dynamic_cast< Block * >( it.get() );
 
          if( !selectable::isSetSelected( block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_ ) )
             continue;
@@ -932,7 +931,7 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
                   auto neighbor = dynamic_cast< Block * >( forest->getBlock(receiverId) );
                   WALBERLA_ASSERT_EQUAL( neighbor->getProcess(), block->getProcess() )
 
-                  for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
+                  for(auto & packInfo : packInfos_)
                   {
                      if( localMode_ == BUFFER )
                      {
@@ -941,14 +940,14 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
                         const uint_t bufferIndex = uint_c( localBuffers.size() ) - uint_t(1);
 
                         VoidFunction pack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferPacking, this,
-                                                         FINE_TO_COARSE, index, bufferIndex, std::cref( *packInfo ), block, neighbor, *dir );
+                                                         FINE_TO_COARSE, index, bufferIndex, std::cref( packInfo ), block, neighbor, *dir );
 
                         threadsafeLocalCommunication.push_back( pack );
 
                         VoidFunction unpack = std::bind( &NonUniformBufferedScheme<Stencil>::localBufferUnpacking, this,
-                                                           FINE_TO_COARSE, index, bufferIndex, std::cref( *packInfo ), neighbor, block, *dir  );
+                                                           FINE_TO_COARSE, index, bufferIndex, std::cref( packInfo ), neighbor, block, *dir  );
 
-                        if( (*packInfo)->threadsafeReceiving() )
+                        if( packInfo->threadsafeReceiving() )
                            threadsafeLocalCommunicationUnpack.push_back( unpack );
                         else
                            localCommunicationUnpack.push_back( unpack );
@@ -956,8 +955,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
                      else
                      {
                         VoidFunction localCommunicationFunction = std::bind( &blockforest::communication::NonUniformPackInfo::communicateLocalFineToCoarse,
-                                                                               *packInfo, block, neighbor, *dir );
-                        if( (*packInfo)->threadsafeReceiving() )
+                                                                               packInfo, block, neighbor, *dir );
+                        if( packInfo->threadsafeReceiving() )
                            threadsafeLocalCommunication.push_back( localCommunicationFunction );
                         else
                            localCommunication.push_back( localCommunicationFunction );
@@ -971,8 +970,8 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
                   if( !packInfos_.empty() )
                      sendFunctions[ nProcess ].push_back( std::bind( NonUniformBufferedScheme<Stencil>::writeHeader, std::placeholders::_1, block->getId(), receiverId, *dir ) );
 
-                  for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-                     sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataFineToCoarse, *packInfo, block, receiverId, *dir, std::placeholders::_1 ) );
+                  for(auto & packInfo : packInfos_)
+                     sendFunctions[ nProcess ].push_back( std::bind( &blockforest::communication::NonUniformPackInfo::packDataFineToCoarse, packInfo, block, receiverId, *dir, std::placeholders::_1 ) );
                }
             }
          }
@@ -997,11 +996,11 @@ void NonUniformBufferedScheme<Stencil>::startCommunicationFineToCoarse( const ui
 
       resetBufferSystem( bufferSystem );
 
-      for( auto sender = sendFunctions.begin(); sender != sendFunctions.end(); ++sender )
-         bufferSystem->addSendingFunction( int_c(sender->first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender->second ) );
+      for( auto sender : sendFunctions )
+         bufferSystem->addSendingFunction( int_c(sender.first), std::bind(  NonUniformBufferedScheme<Stencil>::send, std::placeholders::_1, sender.second ) );
 
-      for( auto receiver = ranksToReceiveFrom.begin(); receiver != ranksToReceiveFrom.end(); ++receiver )
-         bufferSystem->addReceivingFunction( int_c(*receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
+      for(auto receiver : ranksToReceiveFrom)
+         bufferSystem->addReceivingFunction( int_c(receiver), std::bind( &NonUniformBufferedScheme<Stencil>::receive, this, std::placeholders::_1 ) );
 
       setupBeforeNextCommunication = char(0);
    }
@@ -1019,10 +1018,10 @@ void NonUniformBufferedScheme<Stencil>::resetBufferSystem( shared_ptr< mpi::Open
 
    bool constantSizes     = true;
    bool threadsafeReceive = true;
-   for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
+   for(auto & packInfo : packInfos_)
    {
-      if( !(*packInfo)->constantDataExchange() ) constantSizes = false;
-      if( !(*packInfo)->threadsafeReceiving()  ) threadsafeReceive = false;
+      if( !packInfo->constantDataExchange() ) constantSizes = false;
+      if( !packInfo->threadsafeReceiving()  ) threadsafeReceive = false;
    }
 
    bufferSystem->setReceiverInfo( !constantSizes );
@@ -1048,8 +1047,8 @@ void NonUniformBufferedScheme<Stencil>::start( const INDEX i, const uint_t j )
 
    if( localMode_ == START )
    {
-      for( auto function = localCommunication.begin(); function != localCommunication.end(); ++function )
-         (*function)();
+      for(auto & function : localCommunication)
+         function();
 
       const int threadsafeLocalCommunicationSize = int_c( threadsafeLocalCommunication.size() );
 #ifdef _OPENMP
@@ -1089,8 +1088,8 @@ void NonUniformBufferedScheme<Stencil>::wait( const INDEX i, const uint_t j )
 
    if( localMode_ == WAIT )
    {
-      for( auto function = localCommunication.begin(); function != localCommunication.end(); ++function )
-         (*function)();
+      for(auto & function : localCommunication)
+         function();
 
       const int threadsafeLocalCommunicationSize = int_c( threadsafeLocalCommunication.size() );
 #ifdef _OPENMP
@@ -1101,8 +1100,8 @@ void NonUniformBufferedScheme<Stencil>::wait( const INDEX i, const uint_t j )
    }
    else if( localMode_ == BUFFER )
    {
-      for( auto function = localCommunicationUnpack.begin(); function != localCommunicationUnpack.end(); ++function )
-         (*function)();
+      for(auto & function : localCommunicationUnpack)
+         function();
 
       const int threadsafeLocalCommunicationUnpackSize = int_c( threadsafeLocalCommunicationUnpack.size() );
 #ifdef _OPENMP
@@ -1144,8 +1143,8 @@ void NonUniformBufferedScheme<Stencil>::readHeader( RecvBuffer & buffer, BlockID
 template< typename Stencil >
 void NonUniformBufferedScheme<Stencil>::send( SendBuffer & buffer, std::vector< SendBufferFunction > & functions )
 {
-   for( auto function = functions.begin(); function != functions.end(); ++function )
-      (*function)( buffer );
+   for(auto & function : functions)
+      function( buffer );
 }
 
 
@@ -1171,18 +1170,18 @@ void NonUniformBufferedScheme<Stencil>::receive( RecvBuffer & buffer )
 
       if( senderLevel == receiverLevel )
       {
-         for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-            (*packInfo)->unpackDataEqualLevel( block, stencil::inverseDir[dir], buffer );
+         for(auto & packInfo : packInfos_)
+            packInfo->unpackDataEqualLevel( block, stencil::inverseDir[dir], buffer );
       }
       else if( senderLevel < receiverLevel ) // coarse to fine
       {
-         for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-            (*packInfo)->unpackDataCoarseToFine( block, sender, stencil::inverseDir[dir], buffer );
+         for(auto & packInfo : packInfos_)
+            packInfo->unpackDataCoarseToFine( block, sender, stencil::inverseDir[dir], buffer );
       }
       else if( senderLevel > receiverLevel ) // fine to coarse
       {
-         for( auto packInfo = packInfos_.begin(); packInfo != packInfos_.end(); ++packInfo )
-            (*packInfo)->unpackDataFineToCoarse( block, sender, stencil::inverseDir[dir], buffer );
+         for(auto & packInfo : packInfos_)
+            packInfo->unpackDataFineToCoarse( block, sender, stencil::inverseDir[dir], buffer );
       }
    }
 }
@@ -1244,15 +1243,13 @@ void NonUniformBufferedScheme<Stencil>::localBufferUnpacking( const INDEX i, con
 template< typename Stencil >
 bool NonUniformBufferedScheme<Stencil>::isAnyCommunicationInProgress() const
 {
-   for( auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt )
-      for( auto levelIt = caseIt->begin(); levelIt != caseIt->end(); ++levelIt )
-         if( *levelIt )
+   for(const auto & communicationInProgres : communicationInProgress_)
+      for(bool inProgress : communicationInProgres)
+         if( inProgress )
             return true;
 
    return false;
 }
 
 
-} // namespace communication
-} // namespace blockforest
 } // namespace walberla
diff --git a/src/core/mpi/BufferSystem.h b/src/core/mpi/BufferSystem.h
index 04161810a408aab7bc0baf6ddb6159714333494a..02bdd60bb38452899d84abeb78d19c68b03f1b69 100644
--- a/src/core/mpi/BufferSystem.h
+++ b/src/core/mpi/BufferSystem.h
@@ -245,9 +245,9 @@ protected:
 
 
    struct SendInfo {
-      SendInfo() : alreadySent(false) {}
+      SendInfo()= default;
       SendBuffer_T buffer;
-      bool alreadySent;
+      bool alreadySent{false};
    };
    std::map<MPIRank, SendInfo> sendInfos_;
 
diff --git a/src/core/mpi/BufferSystem.impl.h b/src/core/mpi/BufferSystem.impl.h
index 183d29bd86b7412090916e0b2be92c3dc7a4c352..3715a7ea9f3daf95c0bde6163954ca72159e48d3 100644
--- a/src/core/mpi/BufferSystem.impl.h
+++ b/src/core/mpi/BufferSystem.impl.h
@@ -52,7 +52,7 @@ void GenericBufferSystem<Rb, Sb>::iterator::operator++()
 {
    currentRecvBuffer_ = bufferSystem_.waitForNext( currentSenderRank_ );
    if ( ! currentRecvBuffer_ ) {
-      WALBERLA_ASSERT_EQUAL( currentSenderRank_, -1 );
+      WALBERLA_ASSERT_EQUAL( currentSenderRank_, -1 )
    } else
    {
       bufferSystem_.bytesReceived_ += currentRecvBuffer_->size() * sizeof(RecvBuffer::ElementType);
@@ -64,7 +64,7 @@ template< typename Rb, typename Sb>
 bool GenericBufferSystem<Rb, Sb>::iterator::operator==( const typename GenericBufferSystem<Rb, Sb>::iterator & other )
 {
    // only equality checks with end iterators are allowed
-   WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 );
+   WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 )
 
    return ( currentSenderRank_ == other.currentSenderRank_ );
 }
@@ -73,7 +73,7 @@ template< typename Rb, typename Sb>
 bool GenericBufferSystem<Rb, Sb>::iterator::operator!=( const typename GenericBufferSystem<Rb, Sb>::iterator & other )
 {
    // only equality checks with end iterators are allowed
-   WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 );
+   WALBERLA_ASSERT( other.currentSenderRank_ == -1 || currentSenderRank_ == -1 )
 
    return ( currentSenderRank_ != other.currentSenderRank_ );
 }
@@ -90,7 +90,7 @@ template< typename Rb, typename Sb>
 template<typename RankIter>
 void GenericBufferSystem<Rb, Sb>::setReceiverInfo( RankIter rankBegin, RankIter rankEnd, bool changingSize )
 {
-   WALBERLA_ASSERT( ! communicationRunning_ );
+   WALBERLA_ASSERT( ! communicationRunning_ )
 
    recvInfos_.clear();
    for ( auto it = rankBegin; it != rankEnd; ++it )
@@ -147,7 +147,7 @@ GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &oth
      recvInfos_( other.recvInfos_ ),
      sendInfos_( other.sendInfos_ )
 {
-   WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" );
+   WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" )
    if( other.currentComm_ == &other.knownSizeComm_ )
       currentComm_ = &knownSizeComm_;
    else if ( other.currentComm_ == &other.unknownSizeComm_ )
@@ -163,7 +163,7 @@ GenericBufferSystem<Rb, Sb>::GenericBufferSystem( const GenericBufferSystem &oth
 template< typename Rb, typename Sb>
 GenericBufferSystem<Rb, Sb> & GenericBufferSystem<Rb, Sb>::operator=( const GenericBufferSystem<Rb, Sb> & other )
 {
-   WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" );
+   WALBERLA_ASSERT( !communicationRunning_, "Can't copy GenericBufferSystem while communication is running" )
 
    sizeChangesEverytime_ = other.sizeChangesEverytime_;
    communicationRunning_ = other.communicationRunning_;
@@ -204,12 +204,11 @@ GenericBufferSystem<Rb, Sb> & GenericBufferSystem<Rb, Sb>::operator=( const Gene
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::set<MPIRank> & ranksToRecvFrom, bool changingSize )
 {
-   WALBERLA_ASSERT( ! communicationRunning_ );
+   WALBERLA_ASSERT( ! communicationRunning_ )
 
    recvInfos_.clear();
-   for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it )
+   for ( auto sender : ranksToRecvFrom )
    {
-      const MPIRank sender = *it;
       recvInfos_[ sender ].size = INVALID_SIZE;
    }
 
@@ -234,10 +233,10 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISiz
    WALBERLA_ASSERT( ! communicationRunning_ )
 
    recvInfos_.clear();
-   for ( auto it = ranksToRecvFrom.begin(); it != ranksToRecvFrom.end(); ++it )
+   for ( auto it : ranksToRecvFrom )
    {
-      const MPIRank sender       = it->first;
-      const MPISize senderSize   = it->second;
+      const MPIRank sender       = it.first;
+      const MPISize senderSize   = it.second;
       WALBERLA_ASSERT_GREATER( senderSize, 0 )
       recvInfos_[ sender ].size   = senderSize;
    }
@@ -269,7 +268,7 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfo( const std::map<MPIRank,MPISiz
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::setReceiverInfoFromSendBufferState( bool useSizeFromSendBuffers, bool changingSize )
 {
-   WALBERLA_ASSERT( ! communicationRunning_ );
+   WALBERLA_ASSERT( ! communicationRunning_ )
 
    recvInfos_.clear();
    for ( auto it = sendInfos_.begin(); it != sendInfos_.end(); ++it )
@@ -303,7 +302,7 @@ void GenericBufferSystem<Rb, Sb>::setReceiverInfoFromSendBufferState( bool useSi
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::sizeHasChanged( bool alwaysChangingSize )
 {
-   WALBERLA_ASSERT( ! communicationRunning_ );
+   WALBERLA_ASSERT( ! communicationRunning_ )
 
    sizeChangesEverytime_ = alwaysChangingSize;
    setCommunicationType( false );
@@ -347,7 +346,7 @@ Sb & GenericBufferSystem<Rb, Sb>::sendBuffer( MPIRank rank )
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::sendAll()
 {
-   WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ); // call setReceiverInfo first!
+   WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ) // call setReceiverInfo first!
 
    if ( !communicationRunning_ )
       startCommunication();
@@ -380,14 +379,14 @@ void GenericBufferSystem<Rb, Sb>::sendAll()
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::send( MPIRank rank )
 {
-   WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ); // call setReceiverInfo first!
+   WALBERLA_ASSERT_NOT_NULLPTR( currentComm_ ) // call setReceiverInfo first!
 
    if ( !communicationRunning_ )
       startCommunication();
 
    auto iter = sendInfos_.find(rank);
-   WALBERLA_ASSERT( iter != sendInfos_.end() );   // no send buffer was created for this rank
-   WALBERLA_ASSERT( ! iter->second.alreadySent ); // this buffer has already been sent
+   WALBERLA_ASSERT( iter != sendInfos_.end() )   // no send buffer was created for this rank
+   WALBERLA_ASSERT( ! iter->second.alreadySent ) // this buffer has already been sent
 
    if ( iter->second.buffer.size() > 0 )
    {
@@ -424,10 +423,10 @@ void GenericBufferSystem<Rb, Sb>::startCommunication()
 
    const auto tag = currentComm_->getTag();
    WALBERLA_CHECK_EQUAL(activeTags_.find(tag), activeTags_.end(),
-                        "Another communication with the same MPI tag is currently in progress.");
+                        "Another communication with the same MPI tag is currently in progress.")
    activeTags_.insert(tag);
 
-   WALBERLA_CHECK( ! communicationRunning_ );
+   WALBERLA_CHECK( ! communicationRunning_ )
 
    currentComm_->scheduleReceives( recvInfos_ );
    communicationRunning_ = true;
@@ -452,7 +451,7 @@ void GenericBufferSystem<Rb, Sb>::startCommunication()
 template< typename Rb, typename Sb>
 void GenericBufferSystem<Rb, Sb>::endCommunication()
 {
-   WALBERLA_CHECK( communicationRunning_ );
+   WALBERLA_CHECK( communicationRunning_ )
    currentComm_->waitForSends();
 
    // Clear send buffers
@@ -482,7 +481,7 @@ void GenericBufferSystem<Rb, Sb>::endCommunication()
 template< typename Rb, typename Sb>
 Rb * GenericBufferSystem<Rb, Sb>::waitForNext( MPIRank & fromRank )
 {
-   WALBERLA_ASSERT( communicationRunning_ );
+   WALBERLA_ASSERT( communicationRunning_ )
 
    fromRank = currentComm_->waitForNextReceive( recvInfos_ );
 
@@ -531,7 +530,7 @@ void GenericBufferSystem<Rb, Sb>::setCommunicationType( const bool knownSize )
 template< typename Rb, typename Sb>
 typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::noRanks()
 {
-   return RankRange();
+   return {};
 }
 template< typename Rb, typename Sb>
 typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allRanks()
@@ -544,7 +543,7 @@ typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allR
 template< typename Rb, typename Sb>
 typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allRanksButRoot()
 {
-   int numProcesses = MPIManager::instance()->numProcesses();
+   int const numProcesses = MPIManager::instance()->numProcesses();
    RankRange r;
    std::generate_n( std::inserter(r, r.end()), numProcesses-1, [&]{ return MPIRank(r.size()+size_t(1)); } );
    return r;
@@ -552,7 +551,7 @@ typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::allR
 template< typename Rb, typename Sb>
 typename GenericBufferSystem<Rb, Sb>::RankRange GenericBufferSystem<Rb,Sb>::onlyRank( int includedRank )
 {
-   WALBERLA_ASSERT_LESS( includedRank, MPIManager::instance()->numProcesses() );
+   WALBERLA_ASSERT_LESS( includedRank, MPIManager::instance()->numProcesses() )
    return RankRange { includedRank };
 }
 
diff --git a/src/core/mpi/BufferSystemHelper.impl.h b/src/core/mpi/BufferSystemHelper.impl.h
index e6236f100ca3d8f58817824c16ac5844e41f4af6..beee5bf78d4b03926b516a09b19b482e77254ba7 100644
--- a/src/core/mpi/BufferSystemHelper.impl.h
+++ b/src/core/mpi/BufferSystemHelper.impl.h
@@ -147,14 +147,14 @@ MPIRank KnownSizeCommunication<Rb, Sb>::waitForNextReceive( std::map<MPIRank, Re
 
    recvRequests_[ uint_c( requestIndex ) ] = MPI_REQUEST_NULL;
 
-   MPIRank senderRank = status.MPI_SOURCE;
+   MPIRank const senderRank = status.MPI_SOURCE;
    WALBERLA_ASSERT_GREATER_EQUAL( senderRank, 0 );
 
 
 #ifndef NDEBUG
    int receivedBytes;
    MPI_Get_count( &status, MPI_BYTE, &receivedBytes );
-   WALBERLA_ASSERT_EQUAL ( recvInfos[senderRank].size, receivedBytes );
+   WALBERLA_ASSERT_EQUAL ( recvInfos[senderRank].size, receivedBytes )
 #else
    WALBERLA_UNUSED( recvInfos );
 #endif
@@ -493,10 +493,7 @@ void NoMPICommunication<Rb, Sb>::send( MPIRank receiver, const Sb & sendBuffer )
 }
 
 template< typename Rb, typename Sb>
-void NoMPICommunication<Rb, Sb>::waitForSends()
-{
-   return;
-}
+void NoMPICommunication<Rb, Sb>::waitForSends(){}
 
 template< typename Rb, typename Sb>
 void NoMPICommunication<Rb, Sb>::scheduleReceives( std::map<MPIRank, ReceiveInfo> & recvInfos )
diff --git a/src/core/mpi/OpenMPBufferSystem.impl.h b/src/core/mpi/OpenMPBufferSystem.impl.h
index 5230d5b9de74a93ba98db6081e53ee752cc86bf7..f87895d7a3467ae4f7984e91e95c1c75472e13b3 100644
--- a/src/core/mpi/OpenMPBufferSystem.impl.h
+++ b/src/core/mpi/OpenMPBufferSystem.impl.h
@@ -57,7 +57,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::addSendingFunction( MPIRank rank, const
    dirty_ = true;
    sendRanks_.push_back( rank );
    sendFunctions_.push_back( sendFunction );
-   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() );
+   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() )
 }
 
 
@@ -72,8 +72,8 @@ void GenericOpenMPBufferSystem<Rb, Sb>::setupBufferSystem()
                    []( typename decltype(recvFunctions_)::value_type r ){ return r.first; });
    bs_.setReceiverInfo( recvRanks, sizeChangesEverytime_ );
 
-   for( auto sendRank = sendRanks_.begin(); sendRank != sendRanks_.end(); ++sendRank ) // Do NOT delete this for loop! This loop is needed ...
-      bs_.sendBuffer( *sendRank ); // ... so that the "sendBuffer(rank)" call in startCommunicationOpenMP is thread-safe!
+   for(auto sendRank : sendRanks_) // Do NOT delete this for loop! This loop is needed ...
+      bs_.sendBuffer( sendRank ); // ... so that the "sendBuffer(rank)" call in startCommunicationOpenMP is thread-safe!
 
    dirty_ = false;
 }
@@ -101,7 +101,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::startCommunicationSerial()
 {
    bs_.scheduleReceives();
 
-   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() );
+   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() )
 
    const uint_t nrOfSendFunctions = sendFunctions_.size();
 
@@ -122,7 +122,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::startCommunicationOpenMP()
 {
    bs_.scheduleReceives();
 
-   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() );
+   WALBERLA_ASSERT_EQUAL( sendRanks_.size(), sendFunctions_.size() )
 
    const int nrOfSendFunctions = int_c( sendFunctions_.size() );
 
@@ -173,7 +173,7 @@ void GenericOpenMPBufferSystem<Rb, Sb>::waitSerial()
       RecvBuffer & recvBuffer = recvIt.buffer();
 
       // call unpacking
-      WALBERLA_ASSERT( recvFunctions_.find( rank ) != recvFunctions_.end() );
+      WALBERLA_ASSERT( recvFunctions_.find( rank ) != recvFunctions_.end() )
       recvFunctions_[rank] ( recvBuffer );
    }
 }
@@ -198,19 +198,19 @@ void GenericOpenMPBufferSystem<Rb, Sb>::waitOpenMP()
       recvBuffer = bs_.waitForNext( recvRank );
 
 
-      WALBERLA_ASSERT_GREATER_EQUAL( recvRank, 0 );
-      WALBERLA_ASSERT_NOT_NULLPTR( recvBuffer );
+      WALBERLA_ASSERT_GREATER_EQUAL( recvRank, 0 )
+      WALBERLA_ASSERT_NOT_NULLPTR( recvBuffer )
 
-      WALBERLA_ASSERT( recvFunctions_.find( recvRank ) != recvFunctions_.end() );
+      WALBERLA_ASSERT( recvFunctions_.find( recvRank ) != recvFunctions_.end() )
       recvFunctions_[recvRank] ( *recvBuffer );
    }
 
    MPIRank rank;
    RecvBuffer * ret = bs_.waitForNext( rank );
-   WALBERLA_ASSERT_NULLPTR( ret ); // call last time to finish communication
+   WALBERLA_ASSERT_NULLPTR( ret ) // call last time to finish communication
    WALBERLA_UNUSED( ret );
 
-   WALBERLA_ASSERT( ! bs_.isCommunicationRunning() );
+   WALBERLA_ASSERT( ! bs_.isCommunicationRunning() )
 }
 
 
diff --git a/src/gpu/communication/CustomMemoryBuffer.h b/src/gpu/communication/CustomMemoryBuffer.h
index e01e873708d84788fcecfb33a83ab3616b07c752..ce8fbc49acea51b736c79e11525c87ec482f069e 100644
--- a/src/gpu/communication/CustomMemoryBuffer.h
+++ b/src/gpu/communication/CustomMemoryBuffer.h
@@ -73,6 +73,7 @@ namespace communication {
       void resize( std::size_t newSize );
       inline std::size_t allocSize() const { return std::size_t(end_ - begin_); }
       inline std::size_t size() const { return std::size_t(cur_ - begin_); }
+      inline std::size_t remainingSize() const { return std::size_t(end_ - cur_); }
       ElementType *ptr() const { return begin_; }
       ElementType *cur() const { return cur_; }
 
diff --git a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h
index f6b39d9b0fe0dd1b9c90c5d63eb7b8ca00bd3d0f..a5dc8d8cfccf70decacc7b33ecb5318e0490b933 100644
--- a/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h
+++ b/src/gpu/communication/GeneratedNonUniformGPUPackInfo.h
@@ -38,32 +38,29 @@ namespace walberla::gpu {
 class GeneratedNonUniformGPUPackInfo
 {
  public:
-   using VoidFunction                  = std::function< void( gpuStream_t) >;
    GeneratedNonUniformGPUPackInfo() = default;
    virtual ~GeneratedNonUniformGPUPackInfo() = default;
 
    virtual bool constantDataExchange() const = 0;
-   virtual bool threadsafeReceiving() const = 0;
+   virtual bool threadsafeReceiving()  const = 0;
 
-   inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer) const;
-   virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0;
+   inline void packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const;
+   virtual void unpackDataEqualLevel( Block * receiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0;
    virtual void communicateLocalEqualLevel( const Block * sender, Block * receiver, stencil::Direction dir, gpuStream_t stream) = 0;
-   virtual void getLocalEqualLevelCommFunction( std::vector< VoidFunction >& commFunctions, const Block * sender, Block * receiver, stencil::Direction dir) = 0;
 
-   inline  void packDataCoarseToFine        ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const;
-   virtual void unpackDataCoarseToFine      (       Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer) = 0;
-   virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir ) = 0;
+   inline  void packDataCoarseToFine        ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const;
+   virtual void unpackDataCoarseToFine      (       Block * fineReceiver, const BlockID & coarseSender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0;
+   virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, gpuStream_t stream) = 0;
    virtual void communicateLocalCoarseToFine( const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0;
-   virtual void getLocalCoarseToFineCommFunction( std::vector< VoidFunction >& commFunctions, const Block * coarseSender, Block * fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0;
 
-   inline  void packDataFineToCoarse        ( const Block * fineSender,     const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const;
-   virtual void unpackDataFineToCoarse      (       Block * coarseReceiver, const BlockID & fineSender,     stencil::Direction dir, GpuBuffer_T & buffer) = 0;
-   virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir) = 0;
+   inline  void packDataFineToCoarse        ( const Block * fineSender,     const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream = nullptr) const;
+   virtual void unpackDataFineToCoarse      (       Block * coarseReceiver, const BlockID & fineSender,     stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0;
+   virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, gpuStream_t stream) = 0;
    virtual void communicateLocalFineToCoarse( const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) = 0;
-   virtual void getLocalFineToCoarseCommFunction( std::vector< VoidFunction >& commFunctions, const Block * fineSender, Block * coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) = 0;
 
    virtual uint_t sizeEqualLevelSend( const Block * sender, stencil::Direction dir) = 0;
    virtual uint_t sizeCoarseToFineSend ( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir) = 0;
+   virtual uint_t sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir) = 0;
    virtual uint_t sizeFineToCoarseSend ( const Block * fineSender, stencil::Direction dir) = 0;
 
 
@@ -72,9 +69,9 @@ class GeneratedNonUniformGPUPackInfo
 #endif
 
  protected:
-   virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const = 0;
-   virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0;
-   virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const = 0;
+   virtual void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0;
+   virtual void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0;
+   virtual void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const = 0;
 
 #ifndef NDEBUG
    mutable std::map< const Block *, std::map< stencil::Direction, std::map< uint_t, size_t > > > bufferSize_;
@@ -82,13 +79,13 @@ class GeneratedNonUniformGPUPackInfo
 
 };
 
-inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer ) const
+inline void GeneratedNonUniformGPUPackInfo::packDataEqualLevel( const Block * sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const
 {
 #ifndef NDEBUG
    size_t const sizeBefore = buffer.size();
 #endif
 
-   packDataEqualLevelImpl( sender, dir, buffer );
+   packDataEqualLevelImpl( sender, dir, buffer, stream);
 
 #ifndef NDEBUG
 size_t const sizeAfter = buffer.size();
@@ -107,13 +104,13 @@ if( constantDataExchange() )
 
 
 
-inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const
+inline void GeneratedNonUniformGPUPackInfo::packDataCoarseToFine( const Block * coarseSender, const BlockID & fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const
 {
 #ifndef NDEBUG
    size_t const sizeBefore = buffer.size();
 #endif
 
-   packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer );
+   packDataCoarseToFineImpl( coarseSender, fineReceiver, dir, buffer, stream);
 
 #ifndef NDEBUG
 size_t const sizeAfter = buffer.size();
@@ -132,13 +129,13 @@ if( constantDataExchange() )
 
 
 
-inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer ) const
+inline void GeneratedNonUniformGPUPackInfo::packDataFineToCoarse( const Block * fineSender, const BlockID & coarseReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const
 {
 #ifndef NDEBUG
    size_t const sizeBefore = buffer.size();
 #endif
 
-   packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer );
+   packDataFineToCoarseImpl( fineSender, coarseReceiver, dir, buffer, stream);
 
 #ifndef NDEBUG
 size_t const sizeAfter = buffer.size();
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index 4b9576434d52dc8726daa35ab91a32fd8984a780..1d3583a12f83295114c5758769689aac186e8399 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -23,40 +23,41 @@
 
 #include "blockforest/StructuredBlockForest.h"
 
+#include "core/DataTypes.h"
 #include "core/mpi/BufferSystem.h"
+#include "core/mpi/MPIManager.h"
 #include "core/mpi/MPIWrapper.h"
 
 #include "domain_decomposition/IBlock.h"
 
 #include "stencil/Directions.h"
 
-#include <thread>
-
 #include "gpu/ErrorChecking.h"
-#include "gpu/GPURAII.h"
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedNonUniformGPUPackInfo.h"
 
+#include <memory>
+#include <thread>
+
 namespace walberla::gpu::communication
 {
 
 template< typename Stencil >
 class NonUniformGPUScheme
 {
- public:
+public:
    enum INDEX { EQUAL_LEVEL = 0, COARSE_TO_FINE = 1, FINE_TO_COARSE = 2 };
 
    using CpuBuffer_T = walberla::gpu::communication::PinnedMemoryBuffer;
    using GpuBuffer_T = walberla::gpu::communication::GPUMemoryBuffer;
 
    explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU = false,
-                                const int tag = 5432);
+                                int tag = 5432);
 
    explicit NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, const Set< SUID >& requiredBlockSelectors,
                                 const Set< SUID >& incompatibleBlockSelectors, bool sendDirectlyFromGPU = false,
-                                const int tag = 5432);
+                                int tag = 5432);
 
    ~NonUniformGPUScheme();
 
@@ -67,9 +68,9 @@ class NonUniformGPUScheme
    //@}
    //*******************************************************************************************************************
 
-   inline void communicateEqualLevel(const uint_t level);
-   inline void communicateCoarseToFine(const uint_t fineLevel);
-   inline void communicateFineToCoarse(const uint_t fineLevel);
+   inline void communicateEqualLevel(uint_t level);
+   inline void communicateCoarseToFine(uint_t fineLevel);
+   inline void communicateFineToCoarse(uint_t fineLevel);
 
    std::function<void()>  communicateEqualLevelFunctor(const uint_t level) {
       return [level, this](){ NonUniformGPUScheme::communicateEqualLevel(level);};
@@ -81,28 +82,28 @@ class NonUniformGPUScheme
       return [fineLevel, this](){ NonUniformGPUScheme::communicateFineToCoarse(fineLevel);};
    }
 
-   inline void startCommunicateEqualLevel(const uint_t level);
-   inline void startCommunicateCoarseToFine(const uint_t fineLevel);
-   inline void startCommunicateFineToCoarse(const uint_t fineLevel);
+   inline void startCommunicateEqualLevel(uint_t level);
+   inline void startCommunicateCoarseToFine(uint_t fineLevel);
+   inline void startCommunicateFineToCoarse(uint_t fineLevel);
 
-   inline void waitCommunicateEqualLevel(const uint_t level);
-   inline void waitCommunicateCoarseToFine(const uint_t fineLevel);
-   inline void waitCommunicateFineToCoarse(const uint_t fineLevel);
+   inline void waitCommunicateEqualLevel(uint_t level);
+   inline void waitCommunicateCoarseToFine(uint_t fineLevel);
+   inline void waitCommunicateFineToCoarse(uint_t fineLevel);
 
- private:
+private:
    void setupCommunication();
 
    void init();
    void refresh();
 
-   bool isAnyCommunicationInProgress() const;
+   [[nodiscard]] bool isAnyCommunicationInProgress() const;
 
-   void startCommunicationEqualLevel(const uint_t index, std::set< uint_t >& participatingLevels);
-   void startCommunicationCoarseToFine(const uint_t index, const uint_t coarsestLevel);
-   void startCommunicationFineToCoarse(const uint_t index, const uint_t finestLevel);
+   void startCommunicationEqualLevel(uint_t index, std::set< uint_t >& participatingLevels);
+   void startCommunicationCoarseToFine(uint_t index, uint_t coarsestLevel);
+   void startCommunicationFineToCoarse(uint_t index, uint_t finestLevel);
 
    weak_ptr< StructuredBlockForest > blockForest_;
-   uint_t forestModificationStamp_;
+   uint_t forestModificationStamp_{uint_c(0)};
 
    std::vector< std::vector< bool > > communicationInProgress_;
    bool sendFromGPU_;
@@ -111,11 +112,10 @@ class NonUniformGPUScheme
    std::vector< std::vector< mpi::GenericBufferSystem< CpuBuffer_T, CpuBuffer_T > > > bufferSystemCPU_;
    std::vector< std::vector< mpi::GenericBufferSystem< GpuBuffer_T, GpuBuffer_T > > > bufferSystemGPU_;
    std::vector< std::vector< GpuBuffer_T > > localBuffer_;
+   GpuBuffer_T adaptiveGPUBuffer;
 
    std::vector< shared_ptr< GeneratedNonUniformGPUPackInfo > > packInfos_;
 
-   ParallelStreams parallelSectionManager_;
-
    struct Header
    {
       BlockID receiverId;
@@ -126,13 +126,15 @@ class NonUniformGPUScheme
 
    Set< SUID > requiredBlockSelectors_;
    Set< SUID > incompatibleBlockSelectors_;
+
+   gpuStream_t streams_[Stencil::Q];
 };
 
 template< typename Stencil >
 NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBlockForest >& bf, bool sendDirectlyFromGPU,
                                                     const int tag)
-   : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag), parallelSectionManager_(-1),
-     requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet())
+      : blockForest_(bf), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag),
+        requiredBlockSelectors_(Set< SUID >::emptySet()), incompatibleBlockSelectors_(Set< SUID >::emptySet())
 {
    WALBERLA_MPI_SECTION()
    {
@@ -148,9 +150,8 @@ NonUniformGPUScheme< Stencil >::NonUniformGPUScheme(const weak_ptr< StructuredBl
                                                     const Set< SUID >& requiredBlockSelectors,
                                                     const Set< SUID >& incompatibleBlockSelectors,
                                                     bool sendDirectlyFromGPU, const int tag)
-   : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors),
-     incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag),
-     parallelSectionManager_(-1)
+      : blockForest_(bf), requiredBlockSelectors_(requiredBlockSelectors),
+        incompatibleBlockSelectors_(incompatibleBlockSelectors), sendFromGPU_(sendDirectlyFromGPU), baseTag_(tag)
 {
    WALBERLA_MPI_SECTION()
    {
@@ -172,6 +173,11 @@ void NonUniformGPUScheme< Stencil >::init()
    communicationInProgress_.resize(3);
 
    refresh();
+
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
+   }
 }
 
 template< typename Stencil >
@@ -195,10 +201,8 @@ void NonUniformGPUScheme< Stencil >::refresh()
       for (uint_t j = 0; j <= levels; ++j)
       {
          headers_[i][j].clear();
-         bufferSystemCPU_[i].emplace_back(
-            mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
-         bufferSystemGPU_[i].emplace_back(
-            mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
+         bufferSystemCPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
+         bufferSystemGPU_[i].emplace_back(mpi::MPIManager::instance()->comm(), baseTag_ + int_c(i * levels + j));
          localBuffer_[i].emplace_back();
       }
 
@@ -206,10 +210,9 @@ void NonUniformGPUScheme< Stencil >::refresh()
    }
 
 #ifndef NDEBUG
-   for (auto p = packInfos_.begin(); p != packInfos_.end(); ++p)
-      (*p)->clearBufferSizeCheckMap();
+   for (auto & packInfo : packInfos_)
+   packInfo->clearBufferSizeCheckMap();
 #endif
-
    forestModificationStamp_ = forest->getBlockForest().getModificationStamp();
 }
 
@@ -304,63 +307,66 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
          bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear();
 
    // Start filling send buffers
+   for (auto& iBlock : *forest)
    {
-      for (auto& iBlock : *forest)
+      auto senderBlock = dynamic_cast< Block* >(&iBlock);
+
+      if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
+         continue;
+
+      if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end())
+         continue;
+
+      for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
       {
-         auto senderBlock = dynamic_cast< Block* >(&iBlock);
+         const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
 
-         if (!selectable::isSetSelected(senderBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
+         if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx)))
             continue;
-
-         if (participatingLevels.find(senderBlock->getLevel()) == participatingLevels.end())
+         WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
+         if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_))
             continue;
 
-         for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
+         if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
          {
-            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
-
-            if (!(senderBlock->neighborhoodSectionHasEquallySizedBlock(neighborIdx)))
-               continue;
-            WALBERLA_ASSERT_EQUAL(senderBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
-            if (!selectable::isSetSelected(senderBlock->getNeighborState(neighborIdx, uint_t(0)),requiredBlockSelectors_, incompatibleBlockSelectors_))
-               continue;
+            auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
+            for (auto& pi : packInfos_)
+            {
+               pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, streams_[*dir]);
+            }
+         }
+         else
+         {
+            auto nProcess              = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0)));
+            GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess);
 
-            if( senderBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
+            for (auto& pi : packInfos_)
             {
-               auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
-               for (auto& pi : packInfos_)
+               WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
+               WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeEqualLevelSend(senderBlock, *dir))
+               if(sendFromGPU_)
                {
-                  pi->communicateLocalEqualLevel(senderBlock, receiverBlock, *dir, nullptr);
+                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[*dir]);
                }
-            }
-            else
-            {
-               auto nProcess              = mpi::MPIRank(senderBlock->getNeighborProcess(neighborIdx, uint_t(0)));
-               GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(nProcess);
-
-               for (auto& pi : packInfos_)
+               else
                {
-                  WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeEqualLevelSend(senderBlock, *dir))
-
-                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer);
-
-                  if (!sendFromGPU_)
-                  {
-                     auto gpuDataPtr = gpuDataBuffer.cur();
-                     auto size = pi->sizeEqualLevelSend(senderBlock, *dir);
-                     auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size);
-                     WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
-                     WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost))
-                  }
+                  auto gpuDataPtr = gpuDataBuffer.cur();
+                  // packDataEqualLevel moves the pointer with advanceNoResize
+                  pi->packDataEqualLevel(senderBlock, *dir, gpuDataBuffer, streams_[*dir]);
+                  auto size = pi->sizeEqualLevelSend(senderBlock, *dir);
+                  auto cpuDataPtr = bufferSystemCPU_[EQUAL_LEVEL][index].sendBuffer(nProcess).advanceNoResize(size);
+                  WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
+                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir]))
                }
             }
          }
       }
    }
-
    // wait for packing to finish
-   WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+   }
 
 
    if (sendFromGPU_)
@@ -391,78 +397,78 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
          bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear();
 
    // Start filling send buffers
+   for (auto& iBlock : *forest)
    {
-      for (auto& iBlock : *forest)
-      {
-         auto coarseBlock = dynamic_cast< Block* >(&iBlock);
-         auto nLevel      = coarseBlock->getLevel();
+      auto coarseBlock = dynamic_cast< Block* >(&iBlock);
+      auto nLevel      = coarseBlock->getLevel();
 
-         if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
-            continue;
+      if (!selectable::isSetSelected(coarseBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
+         continue;
 
-         if (nLevel != coarsestLevel) continue;
+      if (nLevel != coarsestLevel) continue;
 
-         for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
-         {
-            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
+      for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
+      {
+         const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
+
+         if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
+         if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue;
 
-            if (coarseBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
-            if (!(coarseBlock->neighborhoodSectionHasSmallerBlocks(neighborIdx))) continue;
+         for (uint_t n = 0; n != coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n)
+         {
+            const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n);
+            if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_,
+                                           incompatibleBlockSelectors_))
+               continue;
 
-            for (uint_t n = 0; n != coarseBlock->getNeighborhoodSectionSize(neighborIdx); ++n)
+            if( coarseBlock->neighborExistsLocally( neighborIdx, n ) )
             {
-               const BlockID& fineReceiverId = coarseBlock->getNeighborId(neighborIdx, n);
-               if (!selectable::isSetSelected(coarseBlock->getNeighborState(neighborIdx, n), requiredBlockSelectors_,
-                                              incompatibleBlockSelectors_))
-                  continue;
+               auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) );
+               GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index];
 
-               if( coarseBlock->neighborExistsLocally( neighborIdx, n ) )
+               for (auto& pi : packInfos_)
                {
-                  auto fineReceiverBlock = dynamic_cast< Block * >( forest->getBlock( fineReceiverId ) );
-                  //                  for (auto& pi : packInfos_)
-                  //                  {
-                  //                     pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir);
-                  //                  }
-
-                  GpuBuffer_T& gpuDataBuffer = localBuffer_[COARSE_TO_FINE][index];
                   WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-
-                  for (auto& pi : packInfos_)
-                  {
-                     WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-                     pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr);
-                  }
+                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
+                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
                }
-               else
+            }
+            else
+            {
+               auto nProcess              = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n));
+               GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess);
+               gpuDataBuffer.clear();
+               for (auto& pi : packInfos_)
                {
-                  auto nProcess              = mpi::MPIRank(coarseBlock->getNeighborProcess(neighborIdx, n));
-                  GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(nProcess);
-                  for (auto& pi : packInfos_)
+                  WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
+                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
+                  if (sendFromGPU_)
                   {
-                     WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                     WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-
-                     pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer);
-
-                     if (!sendFromGPU_)
-                     {
-                        auto gpuDataPtr = gpuDataBuffer.cur();
-                        auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir);
-                        auto cpuDataPtr =
-                           bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size);
-                        WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
-                        WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost))
-                     }
+                     pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]);
+                  }
+                  else
+                  {
+                     auto gpuDataPtr = gpuDataBuffer.cur();
+                     // packDataCoarseToFine moves the pointer with advanceNoResize
+                     pi->packDataCoarseToFine(coarseBlock, fineReceiverId, *dir, gpuDataBuffer, streams_[*dir]);
+                     auto size = pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir);
+                     auto cpuDataPtr = bufferSystemCPU_[COARSE_TO_FINE][index].sendBuffer(nProcess).advanceNoResize(size);
+                     WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
+                     WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir]))
                   }
                }
             }
          }
-         localBuffer_[COARSE_TO_FINE][index].clear();
       }
+      localBuffer_[COARSE_TO_FINE][index].clear();
    }
 
+
    // wait for packing to finish
-   WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+   }
 
    if (sendFromGPU_)
       bufferSystemGPU_[COARSE_TO_FINE][index].sendAll();
@@ -494,74 +500,73 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
          bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear();
 
    // Start filling send buffers
+   for (auto& iBlock : *forest)
    {
-      for (auto& iBlock : *forest)
-      {
-         auto fineBlock = dynamic_cast< Block* >(&iBlock);
-         auto nLevel    = fineBlock->getLevel();
+      auto fineBlock = dynamic_cast< Block* >(&iBlock);
+      auto nLevel    = fineBlock->getLevel();
 
-         if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
-            continue;
+      if (!selectable::isSetSelected(fineBlock->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_))
+         continue;
 
-         if (nLevel != finestLevel) continue;
+      if (nLevel != finestLevel) continue;
 
-         for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
-         {
-            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
+      for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
+      {
+         const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
 
-            if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
-            if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue;
-            WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
+         if (fineBlock->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
+         if (!(fineBlock->neighborhoodSectionHasLargerBlock(neighborIdx))) continue;
+         WALBERLA_ASSERT_EQUAL(fineBlock->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
 
-            const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0));
-            if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_,
-                                           incompatibleBlockSelectors_))
-               continue;
-            if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
-            {
-               auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) );
-               //               for (auto& pi : packInfos_)
-               //               {
-               //                  pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir);
-               //               }
+         const BlockID& coarseReceiverId = fineBlock->getNeighborId(neighborIdx, uint_t(0));
+         if (!selectable::isSetSelected(fineBlock->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_,
+                                        incompatibleBlockSelectors_))
+            continue;
+         if( fineBlock->neighborExistsLocally( neighborIdx, uint_t(0) ) )
+         {
+            auto coarseReceiverBlock = dynamic_cast< Block * >( forest->getBlock( coarseReceiverId ) );
+            GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index];
 
-               GpuBuffer_T& gpuDataBuffer = localBuffer_[FINE_TO_COARSE][index];
+            for (auto& pi : packInfos_)
+            {
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-               for (auto& pi : packInfos_)
-               {
-                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-                  pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr);
-               }
+               WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
+               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
             }
-            else
+         }
+         else
+         {
+            auto nProcess              = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0)));
+            GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess);
+            gpuDataBuffer.clear();
+            for (auto& pi : packInfos_)
             {
-               auto nProcess              = mpi::MPIRank(fineBlock->getNeighborProcess(neighborIdx, uint_t(0)));
-               GpuBuffer_T& gpuDataBuffer = bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(nProcess);
-
-               for (auto& pi : packInfos_)
+               WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
+               WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeFineToCoarseSend(fineBlock, *dir))
+               if (sendFromGPU_)
                {
-                  WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-                  WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-
-                  pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer);
-
-                  if (!sendFromGPU_)
-                  {
-                     auto gpuDataPtr = gpuDataBuffer.cur();
-                     auto size = pi->sizeFineToCoarseSend(fineBlock, *dir);
-                     auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size);
-                     WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
-                     WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost))
-                  }
+                  pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]);
+               }
+               else
+               {
+                  auto gpuDataPtr = gpuDataBuffer.cur();
+                  // packDataFineToCoarse moves the pointer with advanceNoResize
+                  pi->packDataFineToCoarse(fineBlock, coarseReceiverId, *dir, gpuDataBuffer, streams_[*dir]);
+                  auto size = pi->sizeFineToCoarseSend(fineBlock, *dir);
+                  auto cpuDataPtr = bufferSystemCPU_[FINE_TO_COARSE][index].sendBuffer(nProcess).advanceNoResize(size);
+                  WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
+                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir]))
                }
             }
          }
-         localBuffer_[FINE_TO_COARSE][index].clear();
       }
+      localBuffer_[FINE_TO_COARSE][index].clear();
    }
-
    // wait for packing to finish
-   WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+   }
 
    if (sendFromGPU_)
       bufferSystemGPU_[FINE_TO_COARSE][index].sendAll();
@@ -596,9 +601,7 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
             {
                GpuBuffer_T& gpuDataBuffer = recvInfo.buffer();
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-               // parallelSection.run([&](auto s) {
-               pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer);
-               // });
+               pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
@@ -608,29 +611,31 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateEqualLevel(const uint_t leve
       for (auto recvInfo = bufferSystemCPU_[EQUAL_LEVEL][level].begin();
            recvInfo != bufferSystemCPU_[EQUAL_LEVEL][level].end(); ++recvInfo)
       {
-         auto& gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank());
+         auto &gpuBuffer = bufferSystemGPU_[EQUAL_LEVEL][level].sendBuffer(recvInfo.rank());
 
          recvInfo.buffer().clear();
          gpuBuffer.clear();
-         for (auto& header : headers_[EQUAL_LEVEL][level][recvInfo.rank()])
+
+         for (auto &header : headers_[EQUAL_LEVEL][level][recvInfo.rank()])
          {
             auto block       = dynamic_cast< Block* >(forest->getBlock(header.receiverId));
-            auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId));
-
             for (auto& pi : packInfos_)
             {
-               auto size       = pi->sizeEqualLevelSend(senderBlock, header.dir);
+               auto size       = pi->sizeEqualLevelSend(block, stencil::inverseDir[header.dir]);
                auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size);
                auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size );
                WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)
 
-               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, nullptr))
-               pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer);
+               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]]))
+               pi->unpackDataEqualLevel(block, stencil::inverseDir[header.dir], gpuBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
-      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+   }
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
    }
    communicationInProgress_[EQUAL_LEVEL][level] = false;
 }
@@ -647,61 +652,51 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateCoarseToFine(const uint_t fi
                               "Trying to access communication for a block storage object that doesn't exist anymore")
    WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels())
 
-   if (sendFromGPU_)
-   {
-      // auto parallelSection = parallelSectionManager_.parallelSection( nullptr );
+   if (sendFromGPU_) {
       for (auto recvInfo = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].begin();
-           recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo)
-      {
+           recvInfo != bufferSystemGPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) {
          recvInfo.buffer().clear();
-         for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()])
-         {
-            auto block       = dynamic_cast< Block* >(forest->getBlock(header.receiverId));
-            auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId));
-
-            for (auto& pi : packInfos_)
-            {
-               // auto size = pi->sizeCoarseToFineSend( senderBlock, block->getId(), header.dir );
-               GpuBuffer_T& gpuDataBuffer = recvInfo.buffer();
+         for (auto &header: headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) {
+            auto fineReceiver = dynamic_cast< Block * >(forest->getBlock(header.receiverId));
+            for (auto &pi: packInfos_) {
+               GpuBuffer_T &gpuDataBuffer = recvInfo.buffer();
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-               // parallelSection.run([&](auto s) {
-               pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer);
-               // });
+               pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir],
+                                          gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
-   }
-   else
+   } else
    {
-      auto parallelSection = parallelSectionManager_.parallelSection(nullptr);
       for (auto recvInfo = bufferSystemCPU_[COARSE_TO_FINE][fineLevel].begin();
-           recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo)
-      {
-         auto& gpuBuffer = bufferSystemGPU_[COARSE_TO_FINE][fineLevel].sendBuffer(recvInfo.rank());
+           recvInfo != bufferSystemCPU_[COARSE_TO_FINE][fineLevel].end(); ++recvInfo) {
 
+         adaptiveGPUBuffer.clear();
+         adaptiveGPUBuffer.resize(recvInfo.buffer().allocSize());
          recvInfo.buffer().clear();
-         gpuBuffer.clear();
-         for (auto& header : headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()])
-         {
-            auto block       = dynamic_cast< Block* >(forest->getBlock(header.receiverId));
-            auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId));
 
-            for (auto& pi : packInfos_)
-            {
-               auto size       = pi->sizeCoarseToFineSend(senderBlock, block->getId(), header.dir);
+         for (auto &header: headers_[COARSE_TO_FINE][fineLevel][recvInfo.rank()]) {
+            auto fineReceiver = dynamic_cast< Block * >(forest->getBlock(header.receiverId));
+            // WALBERLA_ASSERT_NOT_NULLPTR(fineReceiver)
+            for (auto &pi: packInfos_) {
+               auto size = pi->sizeCoarseToFineReceive(fineReceiver, stencil::inverseDir[header.dir]);
+
                auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size);
-               auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size );
+               auto gpuDataPtr = adaptiveGPUBuffer.cur(); // advanceNoResize( size );
                WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)
 
-               parallelSection.run([&](auto s) {
-                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s))
-                  pi->unpackDataCoarseToFine(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer);
-               });
+               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]]))
+               pi->unpackDataCoarseToFine(fineReceiver, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
    }
+
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+   }
    communicationInProgress_[COARSE_TO_FINE][fineLevel] = false;
 }
 
@@ -716,11 +711,9 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
    WALBERLA_CHECK_NOT_NULLPTR(forest,
                               "Trying to access communication for a block storage object that doesn't exist anymore")
    WALBERLA_ASSERT_LESS(fineLevel, forest->getNumberOfLevels())
-   // WALBERLA_ASSERT_EQUAL( forestModificationStamp_, forest->getBlockForest().getModificationStamp() );
 
    if (sendFromGPU_)
    {
-      // auto parallelSection = parallelSectionManager_.parallelSection( nullptr );
       for (auto recvInfo = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].begin();
            recvInfo != bufferSystemGPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo)
       {
@@ -728,50 +721,44 @@ void NonUniformGPUScheme< Stencil >::waitCommunicateFineToCoarse(const uint_t fi
          for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()])
          {
             auto block       = dynamic_cast< Block* >(forest->getBlock(header.receiverId));
-            auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId));
-
             for (auto& pi : packInfos_)
             {
                GpuBuffer_T& gpuDataBuffer = recvInfo.buffer();
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
-               // parallelSection.run([&](auto s) {
-               pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuDataBuffer);
-               // });
+               pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], gpuDataBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
    }
    else
    {
-      auto parallelSection = parallelSectionManager_.parallelSection(nullptr);
       for (auto recvInfo = bufferSystemCPU_[FINE_TO_COARSE][fineLevel].begin();
            recvInfo != bufferSystemCPU_[FINE_TO_COARSE][fineLevel].end(); ++recvInfo)
       {
-         auto& gpuBuffer = bufferSystemGPU_[FINE_TO_COARSE][fineLevel].sendBuffer(recvInfo.rank());
-
          recvInfo.buffer().clear();
-         gpuBuffer.clear();
+         adaptiveGPUBuffer.clear();
+         adaptiveGPUBuffer.resize(recvInfo.buffer().allocSize());
          for (auto& header : headers_[FINE_TO_COARSE][fineLevel][recvInfo.rank()])
          {
             auto block       = dynamic_cast< Block* >(forest->getBlock(header.receiverId));
-            auto senderBlock = dynamic_cast< Block* >(forest->getBlock(header.senderId));
-
             for (auto& pi : packInfos_)
             {
-               auto size       = pi->sizeFineToCoarseSend(senderBlock, header.dir);
+               auto size       = pi->sizeFineToCoarseSend(block, stencil::inverseDir[header.dir]);
                auto cpuDataPtr = recvInfo.buffer().advanceNoResize(size);
-               auto gpuDataPtr = gpuBuffer.cur(); // advanceNoResize( size );
+               auto gpuDataPtr = adaptiveGPUBuffer.cur(); // advanceNoResize( size );
                WALBERLA_ASSERT_NOT_NULLPTR(cpuDataPtr)
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataPtr)
 
-               parallelSection.run([&](auto s) {
-                  WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, s))
-                  pi->unpackDataFineToCoarse(block, senderBlock->getId(), stencil::inverseDir[header.dir], gpuBuffer);
-               });
+               WALBERLA_GPU_CHECK(gpuMemcpyAsync(gpuDataPtr, cpuDataPtr, size, gpuMemcpyHostToDevice, streams_[stencil::inverseDir[header.dir]]))
+               pi->unpackDataFineToCoarse(block, header.senderId, stencil::inverseDir[header.dir], adaptiveGPUBuffer, streams_[stencil::inverseDir[header.dir]]);
             }
          }
       }
    }
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+   }
    communicationInProgress_[FINE_TO_COARSE][fineLevel] = false;
 }
 
@@ -785,25 +772,32 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
                               "Trying to access communication for a block storage object that doesn't exist anymore")
    const uint_t levels = forest->getNumberOfLevels();
 
-   std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize > > >
-      receiverInfo; // how many bytes to send to each neighbor
-   std::vector< std::vector< mpi::BufferSystem > > headerExchangeBs;
+   std::vector< std::vector< std::map< mpi::MPIRank, mpi::MPISize > > > senderInfo; // how many bytes to send to each neighbor
+   std::vector< std::vector< std::set< mpi::MPIRank > > > receiverInfo; // ranks from which this process receives, per level
 
+   std::vector< std::vector< shared_ptr< mpi::BufferSystem > > > headerExchangeBs;
+
+   std::vector< std::vector< mpi::MPISize > > localBufferSize;
+
+   localBufferSize.resize(3);
+   senderInfo.resize(3);
    receiverInfo.resize(3);
+
+   senderInfo[EQUAL_LEVEL].resize(levels + uint_c(1));
+   senderInfo[COARSE_TO_FINE].resize(levels + uint_c(1));
+   senderInfo[FINE_TO_COARSE].resize(levels + uint_c(1));
+
    receiverInfo[EQUAL_LEVEL].resize(levels + uint_c(1));
    receiverInfo[COARSE_TO_FINE].resize(levels + uint_c(1));
    receiverInfo[FINE_TO_COARSE].resize(levels + uint_c(1));
 
-   std::vector< std::vector< mpi::MPISize > > localBufferSize;
-
    headerExchangeBs.resize(3);
-   localBufferSize.resize(3);
 
    for (uint_t j = 0; j <= levels; ++j)
    {
-      headerExchangeBs[EQUAL_LEVEL].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123));
-      headerExchangeBs[COARSE_TO_FINE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123));
-      headerExchangeBs[FINE_TO_COARSE].push_back(mpi::BufferSystem(mpi::MPIManager::instance()->comm(), 123));
+      headerExchangeBs[EQUAL_LEVEL].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
+      headerExchangeBs[COARSE_TO_FINE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
+      headerExchangeBs[FINE_TO_COARSE].push_back(make_shared< mpi::BufferSystem >(mpi::MPIManager::instance()->comm(), 123));
 
       localBufferSize[EQUAL_LEVEL].push_back(mpi::MPISize(0));
       localBufferSize[COARSE_TO_FINE].push_back(mpi::MPISize(0));
@@ -816,7 +810,7 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
       if (!selectable::isSetSelected(block->getState(), requiredBlockSelectors_, incompatibleBlockSelectors_)) continue;
 
       const BlockID& senderId = block->getId();
-      auto nLevel             = block->getLevel();
+      auto level       = block->getLevel();
 
       for (auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir)
       {
@@ -824,6 +818,7 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
          const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex(*dir);
          if (block->getNeighborhoodSectionSize(neighborIdx) == uint_t(0)) continue;
 
+         // EQUAL_LEVEL communication
          if (block->neighborhoodSectionHasEquallySizedBlock(neighborIdx))
          {
             WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
@@ -838,25 +833,28 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
             for (auto& pi : packInfos_)
             {
-               receiverInfo[EQUAL_LEVEL][nLevel][nProcess] += mpi::MPISize(pi->sizeEqualLevelSend(block, *dir));
+               senderInfo[EQUAL_LEVEL][level][nProcess] += mpi::MPISize(pi->sizeEqualLevelSend(block, *dir));
             }
 
-            auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][nLevel].sendBuffer(nProcess);
+            auto& headerBuffer = headerExchangeBs[EQUAL_LEVEL][level]->sendBuffer(nProcess);
             receiverId.toBuffer(headerBuffer);
             senderId.toBuffer(headerBuffer);
             headerBuffer << *dir;
+
+            receiverInfo[EQUAL_LEVEL][level].insert( nProcess );
          }
          else if (block->neighborhoodSectionHasSmallerBlocks(neighborIdx))
          {
-            auto fineLevel = nLevel + uint_c(1); // For indexing always the fineLevel is taken to be consistent.
+            auto fineLevel = level + uint_c(1); // For indexing always the fineLevel is taken to be consistent.
             WALBERLA_ASSERT_LESS(fineLevel, levels)
 
             for (uint_t n = 0; n != block->getNeighborhoodSectionSize(neighborIdx); ++n)
             {
                const BlockID& receiverId = block->getNeighborId(neighborIdx, n);
-               if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_,
-                                              incompatibleBlockSelectors_))
+
+               if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, n), requiredBlockSelectors_, incompatibleBlockSelectors_))
                   continue;
+
                if( block->neighborExistsLocally( neighborIdx, n ) )
                {
                   for (auto& pi : packInfos_)
@@ -866,19 +864,21 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
 
                auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, n));
                for (auto& pi : packInfos_)
-                  receiverInfo[COARSE_TO_FINE][fineLevel][nProcess] +=
-                     mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir));
-               auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel].sendBuffer(nProcess);
+                  senderInfo[COARSE_TO_FINE][fineLevel][nProcess] += mpi::MPISize(pi->sizeCoarseToFineSend(block, receiverId, *dir));
+
+               auto& headerBuffer = headerExchangeBs[COARSE_TO_FINE][fineLevel]->sendBuffer(nProcess);
                receiverId.toBuffer(headerBuffer);
                senderId.toBuffer(headerBuffer);
                headerBuffer << *dir;
+
+               receiverInfo[FINE_TO_COARSE][fineLevel].insert( nProcess );
             }
          }
          else if (block->neighborhoodSectionHasLargerBlock(neighborIdx))
          {
             WALBERLA_ASSERT_EQUAL(block->getNeighborhoodSectionSize(neighborIdx), uint_t(1))
-
             const BlockID& receiverId = block->getNeighborId(neighborIdx, uint_t(0));
+
             if (!selectable::isSetSelected(block->getNeighborState(neighborIdx, uint_t(0)), requiredBlockSelectors_,
                                            incompatibleBlockSelectors_))
                continue;
@@ -886,18 +886,20 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
             if( block->neighborExistsLocally( neighborIdx, uint_t(0) ) )
             {
                for (auto& pi : packInfos_)
-                  localBufferSize[FINE_TO_COARSE][nLevel] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
+                  localBufferSize[FINE_TO_COARSE][level] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
                continue;
             }
 
             auto nProcess = mpi::MPIRank(block->getNeighborProcess(neighborIdx, uint_t(0)));
             for (auto& pi : packInfos_)
-               receiverInfo[FINE_TO_COARSE][nLevel][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
+               senderInfo[FINE_TO_COARSE][level][nProcess] += mpi::MPISize(pi->sizeFineToCoarseSend(block, *dir));
 
-            auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][nLevel].sendBuffer(nProcess);
+            auto& headerBuffer = headerExchangeBs[FINE_TO_COARSE][level]->sendBuffer(nProcess);
             receiverId.toBuffer(headerBuffer);
             senderId.toBuffer(headerBuffer);
             headerBuffer << *dir;
+
+            receiverInfo[COARSE_TO_FINE][level].insert( nProcess );
          }
       }
    }
@@ -906,14 +908,12 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
    {
       for (uint_t j = 0; j <= levels; ++j)
       {
-         headerExchangeBs[i][j].setReceiverInfoFromSendBufferState(false, true);
-         headerExchangeBs[i][j].sendAll();
-         for (auto recvIter = headerExchangeBs[i][j].begin(); recvIter != headerExchangeBs[i][j].end(); ++recvIter)
-         {
-            auto& headerVector = headers_[i][j][recvIter.rank()];
-            auto& buffer       = recvIter.buffer();
-            while (buffer.size())
-            {
+         headerExchangeBs[i][j]->setReceiverInfo(receiverInfo[i][j], false);
+         headerExchangeBs[i][j]->sendAll();
+         for (auto recvIter = headerExchangeBs[i][j]->begin(); recvIter != headerExchangeBs[i][j]->end(); ++recvIter) {
+            auto &headerVector = headers_[i][j][recvIter.rank()];
+            auto &buffer = recvIter.buffer();
+            while (buffer.size()) {
                Header header;
                header.receiverId.fromBuffer(buffer);
                header.senderId.fromBuffer(buffer);
@@ -921,11 +921,9 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
                headerVector.push_back(header);
             }
          }
-
-         bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j]);
-         bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j]);
-
-         for (auto it : receiverInfo[i][j])
+         bufferSystemCPU_[i][j].setReceiverInfo(receiverInfo[i][j], false);
+         bufferSystemGPU_[i][j].setReceiverInfo(receiverInfo[i][j], false);
+         for (auto it : senderInfo[i][j])
          {
             bufferSystemCPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
             bufferSystemGPU_[i][j].sendBuffer(it.first).resize(size_t(it.second));
@@ -934,16 +932,16 @@ void NonUniformGPUScheme< Stencil >::setupCommunication()
             localBuffer_[i][j].resize(size_t(localBufferSize[i][j]));
       }
    }
-
    forestModificationStamp_      = forest->getBlockForest().getModificationStamp();
 }
 
 template< typename Stencil >
 bool NonUniformGPUScheme< Stencil >::isAnyCommunicationInProgress() const
 {
-   for (auto caseIt = communicationInProgress_.begin(); caseIt != communicationInProgress_.end(); ++caseIt)
-      for (auto levelIt = caseIt->begin(); levelIt != caseIt->end(); ++levelIt)
-         if (*levelIt) return true;
+   const uint_t levels = uint_c(communicationInProgress_[0].size());
+   for (uint_t i = 0; i != 3; ++i)
+      for (uint_t j = 0; j != levels; ++j)
+         if (communicationInProgress_[i][j]) return true;
 
    return false;
 }
@@ -957,6 +955,11 @@ NonUniformGPUScheme< Stencil >::~NonUniformGPUScheme()
       waitCommunicateCoarseToFine(i);
       waitCommunicateFineToCoarse(i);
    }
+
+   for (uint_t i = 0; i < Stencil::Q; ++i)
+   {
+      WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+   }
 }
 
 template< typename Stencil >
@@ -969,5 +972,4 @@ void NonUniformGPUScheme< Stencil >::addPackInfo(const shared_ptr< GeneratedNonU
    packInfos_.push_back(pi);
    setupCommunication();
 }
-
 } // namespace walberla::gpu::communication
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
index d6ac87010a6889b899380514ec51d717159bd6f8..96c514fcc4369084273c098ac9bf4ad21310ae29 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.h
@@ -42,23 +42,23 @@ template< typename PdfField_T, bool inplace >
 class NonuniformGPUPackingKernelsWrapper
 {
  public:
-   void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream = nullptr) const  = 0;
-   void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream = nullptr) const = 0;
+   void packAll(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, gpuStream_t stream ) const  = 0;
+   void unpackAll(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, gpuStream_t stream ) const = 0;
    void localCopyAll(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
-                     CellInterval dstInterval, gpuStream_t stream = nullptr) const                                    = 0;
+                     CellInterval dstInterval, gpuStream_t stream ) const                                    = 0;
 
-   void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const  = 0;
-   void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0;
+   void packDirection(PdfField_T* srcField, CellInterval ci, unsigned char* outBuffer, Direction dir, gpuStream_t stream ) const  = 0;
+   void unpackDirection(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream ) const = 0;
    void localCopyDirection(PdfField_T* srcField, CellInterval srcInterval, PdfField_T* dstField,
                            CellInterval dstInterval, Direction dir, gpuStream_t stream) const               = 0;
 
    void unpackRedistribute(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer,
-                           stencil::Direction dir, gpuStream_t stream = nullptr) const = 0;
+                           stencil::Direction dir, gpuStream_t stream ) const = 0;
 
    void packPartialCoalescence(PdfField_T* srcField, PartialCoalescenceMaskFieldGPU* maskField, CellInterval ci,
-                               unsigned char* outBuffer, Direction dir, gpuStream_t stream = nullptr) const                                   = 0;
+                               unsigned char* outBuffer, Direction dir, gpuStream_t stream ) const                                   = 0;
    void zeroCoalescenceRegion(PdfField_T* dstField, CellInterval ci, Direction dir) const                      = 0;
-   void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream = nullptr) const = 0;
+   void unpackCoalescence(PdfField_T* dstField, CellInterval ci, unsigned char* inBuffer, Direction dir, gpuStream_t stream ) const = 0;
 
    uint_t size(CellInterval ci, Direction dir) const                   = 0;
    uint_t size(CellInterval ci) const                                  = 0;
@@ -239,9 +239,7 @@ template< typename PdfField_T >
 class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUniformGPUPackInfo
 {
  public:
-   using VoidFunction                  = std::function< void(gpuStream_t) >;
    using LatticeStorageSpecification_T = typename PdfField_T::LatticeStorageSpecification;
-   using PackingKernels_T              = typename LatticeStorageSpecification_T::PackKernels;
    using Stencil                       = typename LatticeStorageSpecification_T::Stencil;
    using CommunicationStencil          = typename LatticeStorageSpecification_T::CommunicationStencil;
    using CommData_T                    = NonuniformGPUCommData< LatticeStorageSpecification_T >;
@@ -253,58 +251,53 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif
    bool threadsafeReceiving() const override { return false; };
 
    /// Equal Level
-   void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer) override;
+   void unpackDataEqualLevel(Block* receiver, Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) override;
    void communicateLocalEqualLevel(const Block* sender, Block* receiver, stencil::Direction dir,
                                    gpuStream_t stream) override;
-   void getLocalEqualLevelCommFunction(std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver,
-                                       stencil::Direction dir) override;
 
    /// Coarse to Fine
    void unpackDataCoarseToFine(Block* fineReceiver, const BlockID& coarseSender, stencil::Direction dir,
-                               GpuBuffer_T& buffer) override;
-   void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir) override;
+                               GpuBuffer_T& buffer, gpuStream_t stream) override;
+   void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, gpuStream_t stream) override;
    void communicateLocalCoarseToFine(const Block* coarseSender, Block* fineReceiver, stencil::Direction dir,
                                      GpuBuffer_T& buffer, gpuStream_t stream) override;
-   void getLocalCoarseToFineCommFunction(std::vector< VoidFunction >& commFunctions, const Block* coarseSender,
-                                         Block* fineReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override;
 
    /// Fine to Coarse
    void prepareCoalescence(Block* coarseReceiver, gpuStream_t gpuStream = nullptr);
    void unpackDataFineToCoarse(Block* coarseReceiver, const BlockID& fineSender, stencil::Direction dir,
-                               GpuBuffer_T& buffer) override;
+                               GpuBuffer_T& buffer, gpuStream_t stream) override;
 
-   void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir) override;
+   void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir, gpuStream_t stream) override;
    void communicateLocalFineToCoarse(const Block* fineSender, Block* coarseReceiver, stencil::Direction dir,
                                      GpuBuffer_T& buffer, gpuStream_t stream) override;
-   void getLocalFineToCoarseCommFunction(std::vector< VoidFunction >& commFunctions, const Block* fineSender,
-                                         Block* coarseReceiver, stencil::Direction dir, GpuBuffer_T& buffer) override;
 
    uint_t sizeEqualLevelSend(const Block* sender, stencil::Direction dir) override;
    uint_t sizeCoarseToFineSend(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir) override;
+   uint_t sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir) override;
    uint_t sizeFineToCoarseSend(const Block* fineSender, stencil::Direction dir) override;
 
  protected:
-   void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer) const override;
+   void packDataEqualLevelImpl(const Block* sender, stencil::Direction dir, GpuBuffer_T& buffer, gpuStream_t stream) const override;
 
    void packDataCoarseToFineImpl(const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir,
-                                 GpuBuffer_T& buffer) const override;
+                                 GpuBuffer_T& buffer, gpuStream_t stream) const override;
    void packDataFineToCoarseImpl(const Block* fineSender, const BlockID& coarseReceiver, stencil::Direction dir,
-                                 GpuBuffer_T& buffer) const override;
+                                 GpuBuffer_T& buffer, gpuStream_t stream) const override;
 
  private:
    /// Helper Functions
    /// As in PdfFieldPackInfo.h
    Vector3< cell_idx_t > getNeighborShift(const BlockID& fineBlock, stencil::Direction dir) const;
    bool areNeighborsInDirection(const Block* block, const BlockID& neighborID,
-                                const Vector3< cell_idx_t > dirVec) const;
+                                Vector3< cell_idx_t > dirVec) const;
 
-   CellInterval intervalHullInDirection(const CellInterval& ci, const Vector3< cell_idx_t > tangentialDir,
+   CellInterval intervalHullInDirection(const CellInterval& ci, Vector3< cell_idx_t > tangentialDir,
                                         cell_idx_t width) const;
-   bool skipsThroughCoarseBlock(const Block* block, const Direction dir) const;
+   bool skipsThroughCoarseBlock(const Block* block, Direction dir) const;
 
-   void getCoarseBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
+   void getCoarseBlockCommIntervals(const BlockID& fineBlockID, Direction dir, const PdfField_T* field,
                                     std::vector< std::pair< Direction, CellInterval > >& intervals) const;
-   void getFineBlockCommIntervals(const BlockID& fineBlockID, const Direction dir, const PdfField_T* field,
+   void getFineBlockCommIntervals(const BlockID& fineBlockID, Direction dir, const PdfField_T* field,
                                   std::vector< std::pair< Direction, CellInterval > >& intervals) const;
 
    CellInterval getCoarseBlockCoalescenceInterval(const Block* coarseBlock, const BlockID& fineBlockID, Direction dir,
@@ -324,7 +317,7 @@ class NonuniformGeneratedGPUPdfPackInfo : public walberla::gpu::GeneratedNonUnif
 template< typename PdfField_T >
 std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > >
    setupNonuniformGPUPdfCommunication(const std::weak_ptr< StructuredBlockForest >& blocks,
-                                      const BlockDataID pdfFieldID,
+                                      BlockDataID pdfFieldID,
                                       const std::string& dataIdentifier = "NonuniformGPUCommData");
 
 } // namespace walberla::lbm_generated
diff --git a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
index adfbb419a8d3a3c82217fecf974977b28bb2a19b..987cebe9b2bfd343ed0277ed3faefef4dddaa753 100644
--- a/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
+++ b/src/lbm_generated/gpu/NonuniformGeneratedGPUPdfPackInfo.impl.h
@@ -66,7 +66,7 @@ std::shared_ptr< NonuniformGeneratedGPUPdfPackInfo< PdfField_T > >
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block* receiver,
                                                                            Direction dir,
-                                                                           GpuBuffer_T & buffer)
+                                                                           GpuBuffer_T & buffer, gpuStream_t stream)
 {
    auto field = receiver->getData< PdfField_T >(pdfFieldID_);
    CellInterval ci;
@@ -74,7 +74,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataEqualLevel(Block
    field->getGhostRegion(dir, ci, gls, false);
    uint_t size              = kernels_.size(ci, dir);
    auto bufferPtr = buffer.advanceNoResize(size);
-   kernels_.unpackDirection(field, ci, bufferPtr, dir);
+   kernels_.unpackDirection(field, ci, bufferPtr, dir, stream);
 }
 
 template< typename PdfField_T>
@@ -92,37 +92,10 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalEqualLevel
    kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, stream);
 }
 
-template< typename PdfField_T>
-void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalEqualLevelCommFunction(
-   std::vector< VoidFunction >& commFunctions, const Block* sender, Block* receiver,
-   stencil::Direction dir)
-{
-   auto srcField = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_);
-   auto dstField = receiver->getData< PdfField_T >(pdfFieldID_);
-
-   CellInterval srcRegion;
-   CellInterval dstRegion;
-   cell_idx_t gls = skipsThroughCoarseBlock(sender, dir) ? 2 : 1;
-   srcField->getSliceBeforeGhostLayer(dir, srcRegion, gls, false);
-   dstField->getGhostRegion(stencil::inverseDir[dir], dstRegion, gls, false);
-
-//   VoidFunction t = std::bind(kernels_.localCopyDirection,
-//                                         srcField, srcRegion, dstField, dstRegion, dir, std::placeholders::_1 );
-
-//   CellInterval test(srcRegion.min(), srcRegion.max());
-//   CellInterval test2(dstRegion.min(), dstRegion.max());
-
-
-   auto commFunction = [this, srcField, srcRegion, dstField, dstRegion, dir](gpuStream_t gpuStream)
-   {
-      kernels_.localCopyDirection(srcField, srcRegion, dstField, dstRegion, dir, gpuStream);
-   };
-   commFunctions.emplace_back(commFunction);
-}
 
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl(
-   const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer) const
+   const Block* sender, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const
 {
    auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_);
    CellInterval ci;
@@ -130,7 +103,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl(
    field->getSliceBeforeGhostLayer(dir, ci, gls, false);
    uint_t size              = kernels_.size(ci, dir);
    auto bufferPtr = buffer.advanceNoResize(size);
-   kernels_.packDirection(field, ci, bufferPtr, dir);
+   kernels_.packDirection(field, ci, bufferPtr, dir, stream);
 }
 
 /***********************************************************************************************************************
@@ -139,7 +112,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataEqualLevelImpl(
 
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl(
-   const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer) const
+   const Block* coarseSender, const BlockID& fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream) const
 {
    auto field = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_);
 
@@ -149,15 +122,14 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataCoarseToFineImpl(
    for (auto t : intervals)
    {
       CellInterval ci          = t.second;
-      uint_t size              = kernels_.size(ci);
-      auto bufferPtr = buffer.advanceNoResize(size);
-      kernels_.packAll(field, ci, bufferPtr);
+      auto bufferPtr = buffer.advanceNoResize(kernels_.size(ci));
+      kernels_.packAll(field, ci, bufferPtr, stream);
    }
 }
 
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine(
-   Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer)
+   Block* fineReceiver, const BlockID& /*coarseSender*/, stencil::Direction dir, GpuBuffer_T & buffer, gpuStream_t stream)
 {
    auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_);
 
@@ -170,13 +142,13 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataCoarseToFine(
       CellInterval ci          = t.second;
       uint_t size              = kernels_.redistributeSize(ci);
       auto bufferPtr = buffer.advanceNoResize(size);
-      kernels_.unpackRedistribute(field, ci, bufferPtr, d);
+      kernels_.unpackRedistribute(field, ci, bufferPtr, d, stream);
    }
 }
 
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFine(
-   const Block* coarseSender, Block* fineReceiver, stencil::Direction dir)
+   const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, gpuStream_t stream)
 {
    auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_);
    auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_);
@@ -208,8 +180,8 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi
       // TODO: This is a dirty workaround. Code-generate direct redistribution!
       unsigned char *buffer;
       WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-      kernels_.packAll(srcField, srcInterval, buffer);
-      kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir);
+      kernels_.packAll(srcField, srcInterval, buffer, stream);
+      kernels_.unpackRedistribute(dstField, dstInterval, buffer, unpackDir, stream);
       WALBERLA_GPU_CHECK(gpuFree(buffer))
    }
 }
@@ -251,50 +223,6 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalCoarseToFi
    }
 }
 
-template< typename PdfField_T>
-void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalCoarseToFineCommFunction(
-   std::vector< VoidFunction >& commFunctions,
-   const Block* coarseSender, Block* fineReceiver, stencil::Direction dir, GpuBuffer_T & buffer)
-{
-   auto srcField = const_cast< Block* >(coarseSender)->getData< PdfField_T >(pdfFieldID_);
-   auto dstField = fineReceiver->getData< PdfField_T >(pdfFieldID_);
-
-   std::vector< std::pair< Direction, CellInterval > > srcIntervals;
-   getCoarseBlockCommIntervals(fineReceiver->getId(), dir, srcField, srcIntervals);
-
-   std::vector< std::pair< Direction, CellInterval > > dstIntervals;
-   getFineBlockCommIntervals(fineReceiver->getId(), stencil::inverseDir[dir], dstField, dstIntervals);
-
-   WALBERLA_ASSERT_EQUAL(srcIntervals.size(), dstIntervals.size())
-
-   for(size_t index = 0; index < srcIntervals.size(); index++)
-   {
-      CellInterval srcInterval = srcIntervals[index].second;
-
-      Direction const unpackDir      = dstIntervals[index].first;
-      CellInterval dstInterval = dstIntervals[index].second;
-
-      uint_t packSize      = kernels_.size(srcInterval);
-
-#ifndef NDEBUG
-      Direction const packDir        = srcIntervals[index].first;
-      WALBERLA_ASSERT_EQUAL(packDir, stencil::inverseDir[unpackDir])
-      uint_t unpackSize = kernels_.redistributeSize(dstInterval);
-      WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
-#endif
-
-      auto bufferPtr = buffer.advanceNoResize(packSize);
-
-      auto commFunction = [this, srcField, srcInterval, bufferPtr, dstField, dstInterval, unpackDir](gpuStream_t gpuStream)
-      {
-         kernels_.packAll(srcField, srcInterval, bufferPtr, gpuStream);
-         kernels_.unpackRedistribute(dstField, dstInterval, bufferPtr, unpackDir, gpuStream);
-      };
-      commFunctions.emplace_back(commFunction);
-   }
-}
-
-
 
 /***********************************************************************************************************************
  *                                          Fine to Coarse Communication                                               *
@@ -318,19 +246,19 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::prepareCoalescence(Block*
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::unpackDataFineToCoarse(
    Block* coarseReceiver, const walberla::BlockID& fineSender, walberla::stencil::Direction dir,
-   GpuBuffer_T & buffer)
+   GpuBuffer_T & buffer, gpuStream_t stream)
 {
    auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_);
 
    CellInterval ci = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender, dir, dstField);
    uint_t size = kernels_.size(ci, dir);
    unsigned char* bufferPtr = buffer.advanceNoResize(size);
-   kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir);
+   kernels_.unpackCoalescence(dstField, ci, bufferPtr, dir, stream);
 }
 
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoarse(
-   const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir)
+   const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, gpuStream_t stream)
 {
    auto varFineSender = const_cast< Block * >(fineSender);
    auto srcField   = varFineSender->getData< PdfField_T >(pdfFieldID_);
@@ -354,8 +282,8 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar
    // TODO: This is a dirty workaround. Code-generate direct redistribution!
    unsigned char *buffer;
    WALBERLA_GPU_CHECK( gpuMalloc( &buffer, packSize))
-   kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir);
-   kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir);
+   kernels_.packPartialCoalescence(srcField, maskField, srcInterval, buffer, dir, stream);
+   kernels_.unpackCoalescence(dstField, dstInterval, buffer, invDir, stream);
    WALBERLA_GPU_CHECK(gpuFree(buffer))
 }
 
@@ -388,39 +316,6 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::communicateLocalFineToCoar
    kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, stream);
 }
 
-template< typename PdfField_T>
-void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::getLocalFineToCoarseCommFunction(
-   std::vector< VoidFunction >& commFunctions,
-   const Block* fineSender, Block* coarseReceiver, walberla::stencil::Direction dir, GpuBuffer_T & buffer)
-{
-   auto varFineSender = const_cast< Block * >(fineSender);
-   auto srcField   = varFineSender->getData< PdfField_T >(pdfFieldID_);
-   auto srcCommData   = varFineSender->getData< CommData_T >(commDataID_);
-   PartialCoalescenceMaskFieldGPU * maskField = &(srcCommData->getMaskFieldGPU());
-   auto dstField = coarseReceiver->getData<PdfField_T>(pdfFieldID_);
-   Direction invDir = stencil::inverseDir[dir];
-
-   CellInterval srcInterval;
-   srcField->getGhostRegion(dir, srcInterval, 2);
-   uint_t packSize = kernels_.partialCoalescenceSize(srcInterval, dir);
-
-   CellInterval dstInterval = getCoarseBlockCoalescenceInterval(coarseReceiver, fineSender->getId(),
-                                                                invDir, dstField);
-
-#ifndef NDEBUG
-   uint_t unpackSize = kernels_.size(dstInterval, invDir);
-   WALBERLA_ASSERT_EQUAL(packSize, unpackSize)
-#endif
-
-   auto bufferPtr = buffer.advanceNoResize(packSize);
-   auto commFunction = [this, srcField, maskField, srcInterval, bufferPtr, dir, dstField, dstInterval, invDir](gpuStream_t gpuStream)
-   {
-      kernels_.packPartialCoalescence(srcField, maskField, srcInterval, bufferPtr, dir, gpuStream);
-      kernels_.unpackCoalescence(dstField, dstInterval, bufferPtr, invDir, gpuStream);
-   };
-   commFunctions.emplace_back(commFunction);
-}
-
 
 template< typename PdfField_T>
 uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeEqualLevelSend( const Block * sender, stencil::Direction dir)
@@ -453,16 +348,35 @@ uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineSend ( c
    return size;
 }
 
+template< typename PdfField_T>
+uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeCoarseToFineReceive ( Block* fineReceiver, stencil::Direction dir)
+{
+
+   auto field = fineReceiver->getData< PdfField_T >(pdfFieldID_);
+
+   std::vector< std::pair< Direction, CellInterval > > intervals;
+   getFineBlockCommIntervals(fineReceiver->getId(), dir, field, intervals);
+
+   uint_t size = 0;
+   for (auto t : intervals)
+   {
+      size += kernels_.redistributeSize(t.second);
+   }
+   WALBERLA_ASSERT_GREATER(size, 0)
+   return size;
+}
+
 
 
 template< typename PdfField_T>
 uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( const Block * sender, stencil::Direction dir)
 {
-   auto field = const_cast< Block* >(sender)->getData< PdfField_T >(pdfFieldID_);
+   auto block      = const_cast< Block* >(sender);
+   auto srcField   = block->getData< PdfField_T >(pdfFieldID_);
 
-   CellInterval ci;
-   field->getGhostRegion(dir, ci, 2);
-   return kernels_.partialCoalescenceSize(ci, dir);
+   CellInterval ci;
+   srcField->getGhostRegion(dir, ci, 2);
+   return kernels_.partialCoalescenceSize(ci, dir);
 }
 
 
@@ -470,7 +384,7 @@ uint_t NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::sizeFineToCoarseSend ( c
 template< typename PdfField_T>
 void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl(
    const Block* fineSender, const walberla::BlockID& /*coarseReceiver*/, walberla::stencil::Direction dir,
-   GpuBuffer_T & buffer) const
+   GpuBuffer_T & buffer, gpuStream_t stream) const
 {
    auto varBlock = const_cast< Block* >(fineSender);
    auto srcField   = varBlock->getData< PdfField_T >(pdfFieldID_);
@@ -481,7 +395,7 @@ void NonuniformGeneratedGPUPdfPackInfo< PdfField_T >::packDataFineToCoarseImpl(
    srcField->getGhostRegion(dir, ci, 2);
    uint_t size = kernels_.partialCoalescenceSize(ci, dir);
    auto bufferPtr = buffer.advanceNoResize(size);
-   kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir);
+   kernels_.packPartialCoalescence(srcField, maskField, ci, bufferPtr, dir, stream);
 }
 
 /***********************************************************************************************************************
diff --git a/src/mesh/blockforest/BlockForestInitialization.cpp b/src/mesh/blockforest/BlockForestInitialization.cpp
index 91b702813a8c20d590bc4ab3a8213c53223b72cf..48a7edf06a08d1e0fe2ddb82c91bd4440bea4bf3 100644
--- a/src/mesh/blockforest/BlockForestInitialization.cpp
+++ b/src/mesh/blockforest/BlockForestInitialization.cpp
@@ -35,19 +35,16 @@
 namespace walberla {
 namespace mesh {
 
-static inline uint_t uintAbsDiff( const uint_t x, const uint_t y )
-{
-   return x > y ? x - y : y - x;
-}
-
-static inline void compareAABB( const AABB & oldAABB, const AABB & newAABB )
-{
-   if(oldAABB != newAABB)
-   {
-      WALBERLA_LOG_INFO_ON_ROOT( "Domain AABB has been adapted to requested block size:\n" \
-                                 "Old AABB: " << oldAABB   << "\n" \
-                                 "New AABB: " << newAABB << "\n" )
-   }
+namespace {
+inline uint_t uintAbsDiff(const uint_t x, const uint_t y) {return x > y ? x - y : y - x;}
+
+inline void compareAABB(const AABB &oldAABB, const AABB &newAABB) {
+   if (oldAABB != newAABB) {
+      WALBERLA_LOG_INFO_ON_ROOT("Domain AABB has been adapted to requested block size:\n" \
+                                "Old AABB: " << oldAABB << "\n" \
+                                "New AABB: " << newAABB << "\n")
+   }
+}
 }
 
 ComplexGeometryBlockforestCreator::ComplexGeometryBlockforestCreator( const AABB & aabb )
@@ -213,7 +210,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryBlockforestCreator::createSetupBlock
 
 shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( const uint_t targetNumRootBlocks ) const
 {
-   shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( targetNumRootBlocks );
+   shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( targetNumRootBlocks );
 
    auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest );
 
@@ -222,7 +219,7 @@ shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( co
 
 shared_ptr<BlockForest> ComplexGeometryBlockforestCreator::createBlockForest( const Vector3<real_t> & blockSize ) const
 {
-   shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( blockSize );
+   shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( blockSize );
 
    auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest );
 
@@ -304,7 +301,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create
 {
 
    Vector3<uint_t> numCells = domainSizeCells();
-   real_t domainVolume = real_c( numCells[0] ) * real_c( numCells[1] ) * real_c( numCells[2] );
+   real_t const domainVolume = real_c( numCells[0] ) * real_c( numCells[1] ) * real_c( numCells[2] );
 
    std::set< uint_t > blockSizeTested;
 
@@ -437,7 +434,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create
 
 shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::createSetupBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const
 {
-   uint_t numProcesses = numBlocks[0] * numBlocks[1] * numBlocks[2];
+   uint_t const numProcesses = numBlocks[0] * numBlocks[1] * numBlocks[2];
 
 
    AABB newAABB(real_t(0), real_t(0), real_t(0), real_c(numBlocks[0] * cellsPerBlock[0]) * cellSize_[0],
@@ -465,7 +462,7 @@ shared_ptr<SetupBlockForest> ComplexGeometryStructuredBlockforestCreator::create
 
 shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const uint_t targetNumRootBlocks ) const
 {
-   shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( targetNumRootBlocks );
+   shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( targetNumRootBlocks );
 
    Vector3< real_t > blockSize( setupBlockForest->getRootBlockXSize(),
                                 setupBlockForest->getRootBlockYSize(),
@@ -488,7 +485,7 @@ shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::c
 
 shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const Vector3<uint_t> & blockSize ) const
 {
-   shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( blockSize );
+   shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( blockSize );
 
    auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest );
 
@@ -501,7 +498,7 @@ shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::c
 
 shared_ptr<StructuredBlockForest> ComplexGeometryStructuredBlockforestCreator::createStructuredBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const
 {
-   shared_ptr< blockforest::SetupBlockForest> setupBlockForest = createSetupBlockForest( cellsPerBlock, numBlocks );
+   shared_ptr< blockforest::SetupBlockForest> const setupBlockForest = createSetupBlockForest( cellsPerBlock, numBlocks );
 
    auto blockForest = make_shared< blockforest::BlockForest >( MPIManager::instance()->rank(), *setupBlockForest );
 
diff --git a/src/mesh/blockforest/BlockForestInitialization.h b/src/mesh/blockforest/BlockForestInitialization.h
index 976e298b95def7a52e46b29c8887d1281c868b90..6ce1f3f9b2c8a96f4fcfe872bdd03154259a895f 100644
--- a/src/mesh/blockforest/BlockForestInitialization.h
+++ b/src/mesh/blockforest/BlockForestInitialization.h
@@ -55,10 +55,10 @@ public:
 
    void setPeriodicity( const Vector3< bool > & periodicity ) { periodicity_ = periodicity; }
 
-   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const uint_t targetNumRootBlocks, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
-   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const Vector3<real_t> & blockSize, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
+   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( uint_t targetNumRootBlocks, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
+   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const Vector3<real_t> & blockSize, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
 
-   shared_ptr<BlockForest>           createBlockForest          ( const uint_t targetNumRootBlocks ) const;
+   shared_ptr<BlockForest>           createBlockForest          ( uint_t targetNumRootBlocks ) const;
    shared_ptr<BlockForest>           createBlockForest          ( const Vector3<real_t> & blockSize ) const;
 
 private:
@@ -95,11 +95,11 @@ public:
 
    void setPeriodicity( const Vector3< bool > & periodicity ) { periodicity_ = periodicity; }
 
-   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const uint_t targetNumRootBlocks, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
-   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const Vector3<uint_t> & blockSize, const uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
+   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( uint_t targetNumRootBlocks, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
+   shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const Vector3<uint_t> & blockSize, uint_t numProcesses = uint_c( MPIManager::instance()->numProcesses() ) ) const;
    shared_ptr<SetupBlockForest>      createSetupBlockForest     ( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const;
 
-   shared_ptr<StructuredBlockForest> createStructuredBlockForest( const uint_t targetNumRootBlocks ) const;
+   shared_ptr<StructuredBlockForest> createStructuredBlockForest( uint_t targetNumRootBlocks ) const;
    shared_ptr<StructuredBlockForest> createStructuredBlockForest( const Vector3<uint_t> & blockSize ) const;
    shared_ptr<StructuredBlockForest> createStructuredBlockForest( const Vector3<uint_t> & cellsPerBlock, const Vector3<uint_t> & numBlocks ) const;
 
diff --git a/src/mesh/boundary/BoundarySetup.cpp b/src/mesh/boundary/BoundarySetup.cpp
index e4d443b5378f24f7986c29e6e7e2d901bf58c74d..7003fa77260d548d8db0a7e73aac29728e818ec5 100644
--- a/src/mesh/boundary/BoundarySetup.cpp
+++ b/src/mesh/boundary/BoundarySetup.cpp
@@ -24,55 +24,56 @@
 #include "field/AddToStorage.h"
 #include "field/vtk/VTKWriter.h"
 
-namespace walberla {
-namespace mesh {
+namespace walberla::mesh {
 
-
-BoundarySetup::BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers )
+BoundarySetup::BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers, const bool doRefinementCorrection)
    : structuredBlockStorage_( structuredBlockStorage ), distanceFunction_( distanceFunction ), numGhostLayers_( numGhostLayers ), cellVectorChunkSize_( size_t(1000) )
 {
    voxelize();
 
-   try {
-      auto & blockForest = dynamic_cast< StructuredBlockForest & >( *structuredBlockStorage_ );
-      if( !blockForest.storesUniformBlockGrid() )
-         refinementCorrection( blockForest );
-   }
-   catch( const std::bad_cast &  ) {} // If it's not a BlockForest no refinement correction is done
+   if (doRefinementCorrection)
+   {
+      try {
+         auto & blockForest = dynamic_cast< StructuredBlockForest & >( *structuredBlockStorage_ );
+         if( !blockForest.storesUniformBlockGrid() )
+            refinementCorrection( blockForest );
+      }
+      catch( const std::bad_cast &  ) {} // If it's not a BlockForest no refinement correction is done
+   }
 }
 
 
 void BoundarySetup::divideAndPushCellInterval( const CellInterval & ci, std::queue< CellInterval > & outputQueue )
 {
-   WALBERLA_ASSERT( !ci.empty() );
+   WALBERLA_ASSERT( !ci.empty() )
 
    Cell newMax( ci.xMin() + std::max( cell_idx_c( ci.xSize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ),
                 ci.yMin() + std::max( cell_idx_c( ci.ySize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ),
                 ci.zMin() + std::max( cell_idx_c( ci.zSize() ) / cell_idx_t(2) - cell_idx_t(1), cell_idx_t(0) ) );
 
-   WALBERLA_ASSERT( ci.contains( newMax ) );
+   WALBERLA_ASSERT( ci.contains( newMax ) )
 
    Cell newMin( newMax[0] + cell_idx_c( 1 ), newMax[1] + cell_idx_c( 1 ), newMax[2] + cell_idx_c( 1 ) );
 
-   outputQueue.push( CellInterval( ci.xMin(), ci.yMin(), ci.zMin(), newMax[0], newMax[1], newMax[2] ) );
+   outputQueue.emplace( ci.xMin(), ci.yMin(), ci.zMin(), newMax[0], newMax[1], newMax[2] );
    if( newMin[2] <= ci.zMax() )
-      outputQueue.push( CellInterval( ci.xMin(), ci.yMin(), newMin[2], newMax[0], newMax[1], ci.zMax() ) );
+      outputQueue.emplace( ci.xMin(), ci.yMin(), newMin[2], newMax[0], newMax[1], ci.zMax() );
    if( newMin[1] <= ci.yMax() )
    {
-      outputQueue.push( CellInterval( ci.xMin(), newMin[1], ci.zMin(), newMax[0], ci.yMax(), newMax[2]) );
+      outputQueue.emplace( ci.xMin(), newMin[1], ci.zMin(), newMax[0], ci.yMax(), newMax[2] );
       if( newMin[2] <= ci.zMax() )
-         outputQueue.push( CellInterval( ci.xMin(), newMin[1], newMin[2], newMax[0], ci.yMax(), ci.zMax() ) );
+         outputQueue.emplace( ci.xMin(), newMin[1], newMin[2], newMax[0], ci.yMax(), ci.zMax() );
    }
    if( newMin[0] <= ci.xMax() )
    {
-      outputQueue.push( CellInterval( newMin[0], ci.yMin(), ci.zMin(), ci.xMax(), newMax[1], newMax[2] ) );
+      outputQueue.emplace( newMin[0], ci.yMin(), ci.zMin(), ci.xMax(), newMax[1], newMax[2] );
       if( newMin[2] <= ci.zMax() )
-         outputQueue.push( CellInterval( newMin[0], ci.yMin(), newMin[2], ci.xMax(), newMax[1], ci.zMax() ) );
+         outputQueue.emplace( newMin[0], ci.yMin(), newMin[2], ci.xMax(), newMax[1], ci.zMax() );
       if( newMin[1] <= ci.yMax() )
       {
-         outputQueue.push( CellInterval( newMin[0], newMin[1], ci.zMin(), ci.xMax(), ci.yMax(), newMax[2] ) );
+         outputQueue.emplace( newMin[0], newMin[1], ci.zMin(), ci.xMax(), ci.yMax(), newMax[2] );
          if( newMin[2] <= ci.zMax() )
-            outputQueue.push( CellInterval( newMin[0], newMin[1], newMin[2], ci.xMax(), ci.yMax(), ci.zMax() ) );
+            outputQueue.emplace( newMin[0], newMin[1], newMin[2], ci.xMax(), ci.yMax(), ci.zMax() );
       }
    }
 }
@@ -92,7 +93,7 @@ void BoundarySetup::allocateOrResetVoxelizationField()
       voxelizationFieldId_ = make_shared< BlockDataID >( field::addToStorage< VoxelizationField >( structuredBlockStorage_, "voxelization field", uint8_t(0), field::fzyx, numGhostLayers_ ) );
    }
 
-   WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ );
+   WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ )
 }
 
 void BoundarySetup::deallocateVoxelizationField()
@@ -112,8 +113,8 @@ void BoundarySetup::voxelize()
    {
       VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ );
 
-      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField );
-      WALBERLA_ASSERT_EQUAL( numGhostLayers_, voxelizationField->nrOfGhostLayers() );
+      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField )
+      WALBERLA_ASSERT_EQUAL( numGhostLayers_, voxelizationField->nrOfGhostLayers() )
 
       CellInterval blockCi = voxelizationField->xyzSizeWithGhostLayer();
       structuredBlockStorage_->transformBlockLocalToGlobalCellInterval( blockCi, block );
@@ -125,11 +126,11 @@ void BoundarySetup::voxelize()
       {
          const CellInterval & curCi = ciQueue.front();
 
-         WALBERLA_ASSERT( !curCi.empty(), "Cell Interval: " << curCi );
+         WALBERLA_ASSERT( !curCi.empty(), "Cell Interval: " << curCi )
 
-         AABB curAABB = structuredBlockStorage_->getAABBFromCellBB( curCi, structuredBlockStorage_->getLevel( block ) );
+         AABB const curAABB = structuredBlockStorage_->getAABBFromCellBB( curCi, structuredBlockStorage_->getLevel( block ) );
 
-         WALBERLA_ASSERT( !curAABB.empty(), "AABB: " << curAABB );
+         WALBERLA_ASSERT( !curAABB.empty(), "AABB: " << curAABB )
 
          Vector3<real_t> cellCenter = curAABB.center();
          structuredBlockStorage_->mapToPeriodicDomain( cellCenter );
@@ -170,7 +171,7 @@ void BoundarySetup::voxelize()
             continue;
          }
 
-         WALBERLA_ASSERT_GREATER( curCi.numCells(), uint_t(1) );
+         WALBERLA_ASSERT_GREATER( curCi.numCells(), uint_t(1) )
          divideAndPushCellInterval( curCi, ciQueue );
 
          ciQueue.pop();
@@ -184,7 +185,7 @@ void BoundarySetup::refinementCorrection( StructuredBlockForest & blockForest )
        || blockForest.getNumberOfYCellsPerBlock() < uint_t( 16 )
        || blockForest.getNumberOfZCellsPerBlock() < uint_t( 16 ) )
    {
-      WALBERLA_ABORT( "The mesh boundary setup requires a block size of at least 16 in each dimension, when refinement is used!" );
+      WALBERLA_ABORT( "The mesh boundary setup requires a block size of at least 16 in each dimension, when refinement is used!" )
    }
 
    for( auto & iBlock : blockForest )
@@ -253,12 +254,11 @@ void BoundarySetup::refinementCorrection( StructuredBlockForest & blockForest )
 
 void BoundarySetup::writeVTKVoxelfile( const std::string & identifier, bool writeGhostLayers, const std::string & baseFolder, const std::string & executionFolder )
 {
-   WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ );
-   WALBERLA_ASSERT_NOT_NULLPTR( structuredBlockStorage_ );
+   WALBERLA_ASSERT_NOT_NULLPTR( voxelizationFieldId_ )
+   WALBERLA_ASSERT_NOT_NULLPTR( structuredBlockStorage_ )
 
    field::createVTKOutput< VoxelizationField >( *voxelizationFieldId_, *structuredBlockStorage_, identifier, uint_t(1), writeGhostLayers ? numGhostLayers_ : uint_t(0), false, baseFolder, executionFolder )();
 }
 
 
-} // namespace mesh
-} // namespace walberla
\ No newline at end of file
+} // namespace walberla::mesh
\ No newline at end of file
diff --git a/src/mesh/boundary/BoundarySetup.h b/src/mesh/boundary/BoundarySetup.h
index 6486bb75d88667e2e560d6dbada4e91df2046828..1993738499166006a8ef0a9eaa50589192709632 100644
--- a/src/mesh/boundary/BoundarySetup.h
+++ b/src/mesh/boundary/BoundarySetup.h
@@ -41,8 +41,7 @@
 
 #include <queue>
 
-namespace walberla {
-namespace mesh {
+namespace walberla::mesh {
 
 class BoundarySetup
 {
@@ -53,23 +52,23 @@ public:
 
    enum Location { INSIDE, OUTSIDE };
 
-   BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, const uint_t numGhostLayers );
+   BoundarySetup( const shared_ptr< StructuredBlockStorage > & structuredBlockStorage, const DistanceFunction & distanceFunction, uint_t numGhostLayers, bool doRefinementCorrection = true );
 
    ~BoundarySetup() { deallocateVoxelizationField(); }
 
    template< typename BoundaryHandlingType >
-   void setDomainCells( const BlockDataID boundaryHandlingID, const Location domainLocation );
+   void setDomainCells( BlockDataID boundaryHandlingID, Location domainLocation );
 
    template< typename BoundaryHandlingType, typename BoundaryFunction, typename Stencil = stencil::D3Q27 >
-   void setBoundaries( const BlockDataID boundaryHandlingID, const BoundaryFunction & boundaryFunction, Location boundaryLocation );
+   void setBoundaries( BlockDataID boundaryHandlingID, const BoundaryFunction & boundaryFunction, Location boundaryLocation );
 
 
    template<typename FlagField_T>
-   void setFlag( const BlockDataID flagFieldID, field::FlagUID flagUID, Location boundaryLocation );
+   void setFlag( BlockDataID flagFieldID, field::FlagUID flagUID, Location boundaryLocation );
 
    // set flag with FlagUID wherever boundaryUID is set
    template< typename FlagField_T, typename BoundaryFunction, typename Stencil = stencil::D3Q27 >
-   void setBoundaryFlag(const BlockDataID flagFieldID, field::FlagUID flagUID, boundary::BoundaryUID boundaryUID,
+   void setBoundaryFlag(BlockDataID flagFieldID, field::FlagUID flagUID, boundary::BoundaryUID boundaryUID,
                         const BoundaryFunction& boundaryFunction, Location boundaryLocation);
 
    void writeVTKVoxelfile( const std::string & identifier = "voxelization", bool writeGhostLayers = false, const std::string & baseFolder = std::string("vtk_out"),
@@ -100,12 +99,12 @@ void BoundarySetup::setDomainCells( const BlockDataID boundaryHandlingId, const
    for( auto & block : *structuredBlockStorage_ )
    {
       BoundaryHandlingType * boundaryHandling  = block.getData< BoundaryHandlingType >( boundaryHandlingId  );
-      WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" );
+      WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" )
 
       const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ );
-      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField );
+      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField )
       WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, boundaryHandling->getFlagField()->nrOfGhostLayers(), "You want to use mesh boundary setup with " \
-                                 << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" );
+                                 << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" )
 
       const uint8_t domainValue = domainLocation == INSIDE ? uint8_t(1) : uint8_t(0);
 
@@ -132,13 +131,13 @@ void BoundarySetup::setFlag( const BlockDataID flagFieldID, field::FlagUID flagU
    for( auto & block : *structuredBlockStorage_ )
    {
       FlagField_T * flagField  = block.getData< FlagField_T >( flagFieldID );
-      WALBERLA_CHECK_NOT_NULLPTR( flagField, "flagFieldID invalid!" );
+      WALBERLA_CHECK_NOT_NULLPTR( flagField, "flagFieldID invalid!" )
       auto flag = flagField->getFlag(flagUID);
 
       const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ );
-      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField );
+      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField )
       WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, flagField->nrOfGhostLayers(), "You want to use mesh boundary setup with " \
-                                 << numGhostLayers_ << " but your flag field has only " << flagField->nrOfGhostLayers() << " ghost layers!" );
+                                 << numGhostLayers_ << " but your flag field has only " << flagField->nrOfGhostLayers() << " ghost layers!" )
 
       const uint8_t domainValue = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1);
 
@@ -156,15 +155,15 @@ void BoundarySetup::setBoundaryFlag(const BlockDataID flagFieldID, field::FlagUI
    for (auto& block : *structuredBlockStorage_)
    {
       FlagField_T* flagField = block.getData< FlagField_T >(flagFieldID);
-      WALBERLA_CHECK_NOT_NULLPTR(flagField, "flagFieldID invalid!");
+      WALBERLA_CHECK_NOT_NULLPTR(flagField, "flagFieldID invalid!")
       auto flag = flagField->getFlag(flagUID);
 
       const VoxelizationField* voxelizationField = block.getData< VoxelizationField >(*voxelizationFieldId_);
-      WALBERLA_ASSERT_NOT_NULLPTR(voxelizationField);
+      WALBERLA_ASSERT_NOT_NULLPTR(voxelizationField)
       WALBERLA_CHECK_LESS_EQUAL(numGhostLayers_, flagField->nrOfGhostLayers(),
                                 "You want to use mesh boundary setup with "
                                    << numGhostLayers_ << " but your flag field has only "
-                                   << flagField->nrOfGhostLayers() << " ghost layers!");
+                                   << flagField->nrOfGhostLayers() << " ghost layers!")
 
       // get where the (fluid) domain is located (on the inside/outside of the mesh)
       const uint8_t domainValue = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1);
@@ -211,12 +210,12 @@ void BoundarySetup::setBoundaries( const BlockDataID boundaryHandlingID, const B
    for( auto & block : *structuredBlockStorage_ )
    {
       BoundaryHandlingType * boundaryHandling  = block.getData< BoundaryHandlingType >( boundaryHandlingID  );
-      WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" );
+      WALBERLA_CHECK_NOT_NULLPTR( boundaryHandling, "boundaryHandlingId invalid!" )
 
       const VoxelizationField * voxelizationField = block.getData< VoxelizationField >( *voxelizationFieldId_ );
-      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField );
+      WALBERLA_ASSERT_NOT_NULLPTR( voxelizationField )
       WALBERLA_CHECK_LESS_EQUAL( numGhostLayers_, boundaryHandling->getFlagField()->nrOfGhostLayers(), "You want to use mesh boundary setup with " \
-                                 << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" );
+                                 << numGhostLayers_ << " but your flag field has only " << boundaryHandling->getFlagField()->nrOfGhostLayers() << " ghost layers!" )
 
       const uint8_t domainValue   = boundaryLocation == INSIDE ? uint8_t(0) : uint8_t(1);
 
@@ -244,5 +243,4 @@ void BoundarySetup::setBoundaries( const BlockDataID boundaryHandlingID, const B
    }
 }
 
-} // namespace mesh
-} // namespace walberla
\ No newline at end of file
+} // namespace walberla::mesh
\ No newline at end of file