
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing changes with 3139 additions and 872 deletions
@@ -204,25 +204,25 @@ template< typename LatticeModel_T, class Enable = void >
 struct StencilString;

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > > >
 {
    static const char * str() { return "D2Q9"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > > >
 {
    static const char * str() { return "D3Q15"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q19 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q19 > > >
 {
    static const char * str() { return "D3Q19"; }
 };

 template< typename LatticeModel_T >
-struct StencilString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value >::type >
+struct StencilString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 > > >
 {
    static const char * str() { return "D3Q27"; }
 };
@@ -232,22 +232,22 @@ template< typename LatticeModel_T, class Enable = void >
 struct CollisionModelString;

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::SRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::SRT_tag > > >
 {
    static const char * str() { return "SRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::TRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::TRT_tag > > >
 {
    static const char * str() { return "TRT"; }
 };

 template< typename LatticeModel_T >
-struct CollisionModelString< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag,
-                                                                                     lbm::collision_model::MRT_tag >::value >::type >
+struct CollisionModelString< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag,
+                                                                                lbm::collision_model::MRT_tag > > >
 {
    static const char * str() { return "MRT"; }
 };
@@ -355,7 +355,7 @@ bool Cylinder::contains( const AABB & aabb ) const
 {
    if( setup_.circularCrossSection )
    {
-      Vector3< real_t > p[8];
+      std::array< Vector3< real_t >, 8 > p;
       p[0].set( aabb.xMin(), aabb.yMin(), aabb.zMin() );
       p[1].set( aabb.xMax(), aabb.yMin(), aabb.zMin() );
       p[2].set( aabb.xMin(), aabb.yMax(), aabb.zMin() );
@@ -521,12 +521,12 @@ public:
    void operator()( SetupBlockForest & forest )
    {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
       {
-         const AABB & aabb = block->getAABB();
-         if( block->getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
-            block->setMarker( true );
+         const AABB & aabb = block.getAABB();
+         if( block.getLevel() < level_ && cylinder_.intersects( aabb, bufferDistance_ ) && !cylinder_.contains( aabb ) )
+            block.setMarker( true );
       }
    }
@@ -633,10 +633,10 @@ static void workloadMemoryAndSUIDAssignment( SetupBlockForest & forest, const me
    }
    else
    {
-      for( auto block = forest.begin(); block != forest.end(); ++block )
+      for(auto & block : forest)
       {
-         block->setWorkload( numeric_cast< workload_t >( uint_t(1) << block->getLevel() ) );
-         block->setMemory( memoryPerBlock );
+         block.setWorkload( numeric_cast< workload_t >( uint_t(1) << block.getLevel() ) );
+         block.setMemory( memoryPerBlock );
       }
    }
 }
@@ -656,7 +656,8 @@ static shared_ptr< SetupBlockForest > createSetupBlockForest( const blockforest:
                                  ( setup.xCells + uint_t(2) * FieldGhostLayers ) ) * memoryPerCell;

    forest->addRefinementSelectionFunction( refinementSelectionFunctions );
-   forest->addWorkloadMemorySUIDAssignmentFunction( std::bind( workloadMemoryAndSUIDAssignment, std::placeholders::_1, memoryPerBlock, std::cref( setup ) ) );
+   forest->addWorkloadMemorySUIDAssignmentFunction([memoryPerBlock, setup_cref = std::cref(setup)](SetupBlockForest & sbf) {
+      return workloadMemoryAndSUIDAssignment(sbf, memoryPerBlock, setup_cref); } );

    forest->init( AABB( real_c(0), real_c(0), real_c(0),
                        setup.H * ( real_c(setup.xBlocks) * real_c(setup.xCells) ) / ( real_c(setup.yzBlocks) * real_c(setup.yzCells) ), setup.H, setup.H ),
@@ -1034,11 +1035,11 @@ Set<SUID> Pseudo2DBlockStateDetermination::operator()( const std::vector< std::p
    auto aabb = forest_.getAABBFromBlockId( destintation );

    Set<SUID> state;
-   for( auto it = source.begin(); it != source.end(); ++it )
+   for(const auto & it : source)
    {
-      for( auto suid = it->second.begin(); suid != it->second.end(); ++suid )
-         if( *suid != state_ )
-            state += *suid;
+      for(auto suid : it.second)
+         if( suid != state_ )
+            state += suid;
    }

    if( markEmptyBlocks_ )
@@ -1064,16 +1065,16 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
    uint_t maxInflowLevel( uint_t(0) );
    uint_t maxOutflowLevel( uint_t(0) );

-   // In addtion to keeping in- and outflow blocks at the same level, this callback also
+   // In addition to keeping in- and outflow blocks at the same level, this callback also
    // prevents these blocks from coarsening.
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
       if( forest.atDomainXMinBorder(*block) )
       {
-         it->second = std::max( it->second, block->getLevel() );
-         maxInflowLevel = std::max( maxInflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxInflowLevel = std::max( maxInflowLevel, minTargetLevel.second );
       }
       if( setup.strictlyObeyL )
       {
@@ -1083,19 +1084,18 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
          p[0] = setup.L;
          if( aabb.contains(p) )
          {
-            it->second = std::max( it->second, block->getLevel() );
-            maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+            minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+            maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
          }
       }
       else if( forest.atDomainXMaxBorder(*block) )
       {
-         it->second = std::max( it->second, block->getLevel() );
-         maxOutflowLevel = std::max( maxOutflowLevel, it->second );
+         minTargetLevel.second = std::max( minTargetLevel.second, block->getLevel() );
+         maxOutflowLevel = std::max( maxOutflowLevel, minTargetLevel.second );
       }
    }

-   for( auto it = blocksAlreadyMarkedForRefinement.begin(); it != blocksAlreadyMarkedForRefinement.end(); ++it )
+   for(auto block : blocksAlreadyMarkedForRefinement)
    {
-      const Block * const block = *it;
       if( forest.atDomainXMinBorder(*block) )
       {
          maxInflowLevel = std::max( maxInflowLevel, block->getTargetLevel() );
@@ -1116,12 +1116,12 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
    mpi::allReduceInplace( maxInflowLevel, mpi::MAX );
    mpi::allReduceInplace( maxOutflowLevel, mpi::MAX );

-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      const Block * const block = it->first;
+      const Block * const block = minTargetLevel.first;
       if( forest.atDomainXMinBorder(*block) )
       {
-         it->second = maxInflowLevel;
+         minTargetLevel.second = maxInflowLevel;
       }
       if( setup.strictlyObeyL )
       {
@@ -1130,10 +1130,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
          auto p = aabb.center();
          p[0] = setup.L;
          if( aabb.contains(p) )
-            it->second = maxOutflowLevel;
+            minTargetLevel.second = maxOutflowLevel;
       }
       else if( forest.atDomainXMaxBorder(*block) )
-         it->second = maxOutflowLevel;
+         minTargetLevel.second = maxOutflowLevel;
    }
 }
@@ -1143,10 +1143,10 @@ void keepInflowOutflowAtTheSameLevel( std::vector< std::pair< const Block *, uin
 void pseudo2DTargetLevelCorrection( std::vector< std::pair< const Block *, uint_t > > & minTargetLevels,
                                     std::vector< const Block * > &, const blockforest::BlockForest & forest )
 {
-   for( auto it = minTargetLevels.begin(); it != minTargetLevels.end(); ++it )
+   for(auto & minTargetLevel : minTargetLevels)
    {
-      if( ! forest.atDomainZMinBorder( *(it->first ) ) )
-         it->second = uint_t(0);
+      if( ! forest.atDomainZMinBorder( *(minTargetLevel.first ) ) )
+         minTargetLevel.second = uint_t(0);
    }
 }
@@ -1177,12 +1177,12 @@ public:
    void operator()( std::vector< std::pair< const PhantomBlock *, walberla::any > > & blockData, const PhantomBlockForest & )
    {
-      for( auto it = blockData.begin(); it != blockData.end(); ++it )
+      for(auto & it : blockData)
       {
-         if( it->first->getState().contains( state_ ) )
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
+         if( it.first->getState().contains( state_ ) )
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(0) : uint8_t(1) );
          else
-            it->second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
+            it.second = Pseudo2DPhantomWeight( markEmptyBlocks_ ? uint8_t(1) : uint8_t(0) );
       }
    }
@@ -1297,10 +1297,9 @@ public:
               const bool logToStream = true, const bool logToFile = true, const std::string& filename = std::string("SchaeferTurek.txt"),
               const Set<SUID> & requiredSelectors = Set<SUID>::emptySet(),
               const Set<SUID> & incompatibleSelectors = Set<SUID>::emptySet() ) :
-      initialized_( false ), blocks_( blocks ),
-      executionCounter_( uint_t(0) ), checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
-      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ), D_( uint_t(0) ), AD_( real_t(0) ), AL_( real_t(0) ), forceEvaluationExecutionCount_( uint_t(0) ),
-      strouhalRising_( false ), strouhalNumberRealD_( real_t(0) ), strouhalNumberDiscreteD_( real_t(0) ), strouhalEvaluationExecutionCount_( uint_t(0) ),
+      blocks_( blocks ),
+      checkFrequency_( checkFrequency ), pdfFieldId_( pdfFieldId ), flagFieldId_( flagFieldId ),
+      fluid_( fluid ), obstacle_( obstacle ), setup_( setup ),
       logToStream_( logToStream ), logToFile_( logToFile ), filename_( filename ),
       requiredSelectors_( requiredSelectors ), incompatibleSelectors_( incompatibleSelectors )
    {
@@ -1345,13 +1344,13 @@ protected:
-   bool initialized_;
+   bool initialized_{ false };

    weak_ptr< StructuredBlockStorage > blocks_;

    std::map< IBlock *, std::vector< std::pair< Cell, stencil::Direction > > > directions_;

-   uint_t executionCounter_;
-   uint_t checkFrequency_;
+   uint_t executionCounter_{ uint_c(0) };
+   uint_t checkFrequency_{ uint_c(0) };

    BlockDataID pdfFieldId_;
    BlockDataID flagFieldId_;
@@ -1361,23 +1360,23 @@ protected:
    Setup setup_;

-   uint_t D_;
-   real_t AD_;
-   real_t AL_;
+   uint_t D_{ uint_c(0) };
+   real_t AD_{ real_c(0) };
+   real_t AL_{ real_c(0) };

    Vector3< real_t > force_;
    std::vector< math::Sample > forceSample_;
-   uint_t forceEvaluationExecutionCount_;
+   uint_t forceEvaluationExecutionCount_{ uint_c(0) };

    std::vector< std::deque< real_t > > coefficients_;
    std::vector< std::pair< real_t, real_t > > coefficientExtremas_;

    std::vector< real_t > strouhalVelocities_;
    std::vector< uint_t > strouhalTimeStep_;
-   bool strouhalRising_;
-   real_t strouhalNumberRealD_;
-   real_t strouhalNumberDiscreteD_;
-   uint_t strouhalEvaluationExecutionCount_;
+   bool strouhalRising_{ false };
+   real_t strouhalNumberRealD_{ real_c(0) };
+   real_t strouhalNumberDiscreteD_{ real_c(0) };
+   uint_t strouhalEvaluationExecutionCount_{ uint_c(0) };

    bool logToStream_;
    bool logToFile_;
@@ -1472,7 +1471,7 @@ void Evaluation< LatticeModel_T >::operator()()
 {
    WALBERLA_LOG_RESULT_ON_ROOT( "force acting on cylinder (in dimensionless lattice units of the coarsest grid - evaluated in time step "
                                 << forceEvaluationExecutionCount_ << "):\n " << force_ << oss.str() <<
-                                "\ndrag and lift coefficients (including extremas of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
+                                "\ndrag and lift coefficients (including extrema of last " << ( coefficients_[0].size() * checkFrequency_ ) << " time steps):"
                                 "\n \"real\" area:"
                                 "\n c_D: " << cDRealArea << " (min = " << coefficientExtremas_[0].first << ", max = " << coefficientExtremas_[0].second << ")" <<
                                 "\n c_L: " << cLRealArea << " (min = " << coefficientExtremas_[1].first << ", max = " << coefficientExtremas_[1].second << ")" <<
@@ -1604,10 +1603,10 @@ void Evaluation< LatticeModel_T >::operator()( IBlock * block, const uint_t leve
    const PdfField_T * const pdfField = block->template getData< PdfField_T >( pdfFieldId_ );

    const auto & directions = directions_[ block ];
-   for( auto pair = directions.begin(); pair != directions.end(); ++pair )
+   for(const auto & pair : directions)
    {
-      const Cell cell( pair->first );
-      const stencil::Direction direction( pair->second );
+      const Cell cell( pair.first );
+      const stencil::Direction direction( pair.second );

       const real_t scaleFactor = real_t(1) / real_c( uint_t(1) << ( (Is2D< LatticeModel_T >::value ? uint_t(1) : uint_t(2)) * level ) );
@@ -1712,8 +1711,8 @@ void Evaluation< LatticeModel_T >::getResultsForSQLOnRoot( std::map< std::string
 {
    WALBERLA_ROOT_SECTION()
    {
-      for( auto result = sqlResults_.begin(); result != sqlResults_.end(); ++result )
-         realProperties[ result->first ] = result->second;
+      for(const auto & sqlResult : sqlResults_)
+         realProperties[ sqlResult.first ] = sqlResult.second;

       integerProperties[ "forceEvaluationTimeStep" ] = int_c( forceEvaluationExecutionCount_ );
       integerProperties[ "strouhalEvaluationTimeStep" ] = int_c( strouhalEvaluationExecutionCount_ );
@@ -2050,9 +2049,7 @@ void Evaluation< LatticeModel_T >::evaluate( real_t & cDRealArea, real_t & cLRea
    // force on obstacle

-   mpi::reduceInplace( force_[0], mpi::SUM );
-   mpi::reduceInplace( force_[1], mpi::SUM );
-   mpi::reduceInplace( force_[2], mpi::SUM );
+   mpi::reduceInplace( force_, mpi::SUM );

    if( setup_.evaluateForceComponents )
    {
@@ -2157,7 +2154,7 @@ public:
    SnapshotSimulator( const weak_ptr<blockforest::StructuredBlockForest> & blocks,
                       const uint_t failAt, const uint_t failRangeBegin, const uint_t failRangeEnd, const bool failRebalance ) :
-      blocks_( blocks ), executionCounter_( 0 ),
+      blocks_( blocks ),
       failAt_( failAt ), failRangeBegin_( failRangeBegin ), failRangeEnd_( failRangeEnd ), failRebalance_( failRebalance )
    {}
@@ -2218,7 +2215,7 @@ private:
    weak_ptr<blockforest::StructuredBlockForest> blocks_;

-   uint_t executionCounter_;
+   uint_t executionCounter_{ uint_c(0) };

    uint_t failAt_;
    uint_t failRangeBegin_;
@@ -2304,11 +2301,11 @@ struct AddRefinementTimeStep
 };

 template< typename LatticeModel_T >
-struct AddRefinementTimeStep< LatticeModel_T, typename std::enable_if< std::is_same< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D2Q9 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q15 >::value ||
-                                                                       std::is_same< typename LatticeModel_T::Stencil, stencil::D3Q27 >::value
-                                                                     >::type >
+struct AddRefinementTimeStep< LatticeModel_T, std::enable_if_t< std::is_same_v< typename LatticeModel_T::CollisionModel::tag, lbm::collision_model::MRT_tag > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D2Q9 > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q15 > ||
+                                                                std::is_same_v< typename LatticeModel_T::Stencil, stencil::D3Q27 >
+                                                              > >
 {
    static void add( SweepTimeloop & timeloop, shared_ptr< blockforest::StructuredBlockForest > & blocks,
                     const BlockDataID & pdfFieldId, const BlockDataID & flagFieldId, const BlockDataID & boundaryHandlingId,
@@ -2365,7 +2362,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    // add velocity field + initialize velocity field writer (only used for simulations with an adaptive block structure)
    using VelocityField_T = field::GhostLayerField<Vector3<real_t>, 1>;
-   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::zyxf, FieldGhostLayers, true, None, Empty );
+   BlockDataID velocityFieldId = field::addToStorage< VelocityField_T >( blocks, "velocity", Vector3<real_t>(0), field::fzyx, FieldGhostLayers, true, None, Empty );
    using VelocityFieldWriter_T = lbm::VelocityFieldWriter<typename Types<LatticeModel_T>::PdfField_T, VelocityField_T>;
    BlockSweepWrapper< VelocityFieldWriter_T > velocityFieldWriter( blocks, VelocityFieldWriter_T( pdfFieldId, velocityFieldId ), None, Empty );
@@ -2431,9 +2428,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    if( vtkBeforeTimeStep )
    {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncBeforeTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                         output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncBeforeTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                         output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
    }

    // evaluation
@@ -2492,8 +2489,8 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
       adaptiveRefinementLog = oss.str();
    }

-   minTargetLevelDeterminationFunctions.add( std::bind( keepInflowOutflowAtTheSameLevel, std::placeholders::_1, std::placeholders::_2,
-                                                        std::placeholders::_3, std::cref(setup) ) );
+   minTargetLevelDeterminationFunctions.add([setup_cref = std::cref(setup)](auto & minTargetLevels, auto & blocksAlreadyMarkedForRefinement, auto & bf) {
+      return keepInflowOutflowAtTheSameLevel(minTargetLevels, blocksAlreadyMarkedForRefinement, bf, setup_cref); } );

    if( Is2D< LatticeModel_T >::value )
       minTargetLevelDeterminationFunctions.add( pseudo2DTargetLevelCorrection );
@@ -2569,14 +2566,14 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
                                                  blockforest::DynamicDiffusionBalance< blockforest::NoPhantomData >( maxIterations, flowIterations ) );
    }

-   // add callback functions which are executed after all block data was unpakced after the dynamic load balancing
+   // add callback functions which are executed after all block data was unpacked after the dynamic load balancing

    // for blocks that have *not* migrated: store current flag field state (required for lbm::PostProcessing)
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::MarkerFieldGenerator< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
                                                                       pdfFieldId, markerDataId, flagFieldFilter ) );

    // (re)set boundaries = (re)initialize flag field for every block with respect to the new block structure (the size of neighbor blocks might have changed)
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( blockforest::BlockForest::RefreshCallbackWrappper( boundarySetter ) );

-   // treat boundary-fluid cell convertions
+   // treat boundary-fluid cell conversions
    blockforest.addRefreshCallbackFunctionAfterBlockDataIsUnpacked( lbm::PostProcessing< LatticeModel_T, field::FlagFieldEvaluationFilter<FlagField_T> >(
                                                                       pdfFieldId, markerDataId, flagFieldFilter ) );

    // (re)set velocity field (velocity field data is not migrated!)
@@ -2612,9 +2609,9 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    if( !vtkBeforeTimeStep )
    {
-      for( auto output = vtkOutputFunctions.begin(); output != vtkOutputFunctions.end(); ++output )
-         timeloop.addFuncAfterTimeStep( output->second.outputFunction, std::string("VTK: ") + output->first,
-                                        output->second.requiredGlobalStates, output->second.incompatibleGlobalStates );
+      for(auto & output : vtkOutputFunctions)
+         timeloop.addFuncAfterTimeStep( output.second.outputFunction, std::string("VTK: ") + output.first,
+                                        output.second.requiredGlobalStates, output.second.incompatibleGlobalStates );
    }

    // stability check (non-finite values in the PDF field?)
@@ -2625,7 +2622,7 @@ void run( const shared_ptr< Config > & config, const LatticeModel_T & latticeMod
    // remaining time logger
-   const double remainingTimeLoggerFrequency = configBlock.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 );
+   const real_t remainingTimeLoggerFrequency = configBlock.getParameter< real_t >( "remainingTimeLoggerFrequency", real_c(3.0) );
    timeloop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeloop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "Remaining time logger" );

    // logging right before the simulation starts
@@ -2920,10 +2917,10 @@ int main( int argc, char **argv )
                             "// //\n"
                             "// Schaefer Turek Benchmark //\n"
                             "// //\n"
-                            "// Reference: Schaefer, M. and Turek, S. (1996) 'Benchmark computations of laminar flow around a cylinder (with support //\n"
+                            "// Reference: Schaefer, M. and Turek, S. (1996) Benchmark computations of laminar flow around a cylinder (with support //\n"
                             "// by F. Durst, E. Krause and R. Rannacher), in E. Hirschel (Ed.): Flow Simulation with High-Performance //\n"
-                            "// Computers II. DFG Priority Research Program Results 1993-1995, No. 52 in Notes Numer, Fluid Mech., //\n"
-                            "// pp.547-566, Vieweg, Weisbaden. //\n"
+                            "// Computers II. DFG Priority Research Program Results 1993-1995, No. 48 in Notes on Numerical Fluid //\n"
+                            "// Mechanics, pp.547-566, Vieweg, Weisbaden. //\n"
                             "// //\n"
                             "//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////" );
@@ -3062,7 +3059,7 @@ int main( int argc, char **argv )
       refinementSelectionFunctions.add( cylinderRefinementSelection );
    }

-   refinementSelectionFunctions.add( std::bind( setInflowOutflowToSameLevel, std::placeholders::_1, setup ) );
+   refinementSelectionFunctions.add([&setup](auto & bf) { return setInflowOutflowToSameLevel(bf, setup); });

    if( setup.pseudo2D )
       refinementSelectionFunctions.add( Pseudo2DRefinementSelectionCorrection );
waLBerla_link_files_to_builddir( *.prm )
if( WALBERLA_BUILD_WITH_CODEGEN )
# Turbulent Channel generation
walberla_generate_target_from_python( NAME TurbulentChannel_CodeGeneration
FILE TurbulentChannel.py
OUT_FILES CodegenIncludes.h
TurbulentChannel_Sweep.cpp TurbulentChannel_Sweep.h
TurbulentChannel_PackInfo.cpp TurbulentChannel_PackInfo.h
TurbulentChannel_Setter.cpp TurbulentChannel_Setter.h
TurbulentChannel_NoSlip.cpp TurbulentChannel_NoSlip.h
TurbulentChannel_FreeSlip_top.cpp TurbulentChannel_FreeSlip_top.h
TurbulentChannel_WFB_bottom.cpp TurbulentChannel_WFB_bottom.h
TurbulentChannel_WFB_top.cpp TurbulentChannel_WFB_top.h
TurbulentChannel_Welford.cpp TurbulentChannel_Welford.h
TurbulentChannel_Welford_TKE_SGS.cpp TurbulentChannel_Welford_TKE_SGS.h
TurbulentChannel_TKE_SGS_Writer.cpp TurbulentChannel_TKE_SGS_Writer.h
)
walberla_add_executable( NAME TurbulentChannel_Application
FILES TurbulentChannel.cpp
DEPENDS walberla::blockforest walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::timeloop
walberla::lbm walberla::stencil walberla::vtk TurbulentChannel_CodeGeneration )
endif()
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file TurbulentChannel.cpp
//! \author Helen Schottenhamml <helen.schottenhamml@fau.de>
//
//======================================================================================================================
#include <memory>
#include <cmath>
#include <string>
#include <iostream>
#include <blockforest/all.h>
#include <core/all.h>
#include <domain_decomposition/all.h>
#include <field/all.h>
#include <field/vtk/VTKWriter.h>
#include <geometry/all.h>
#include <timeloop/all.h>
#include <lbm/all.h>
// Codegen Includes
#include "CodegenIncludes.h"
namespace walberla {
///////////////////////
/// Typedef Aliases ///
///////////////////////
using Stencil_T = codegen::Stencil_T;
// Communication Pack Info
using PackInfo_T = pystencils::TurbulentChannel_PackInfo;
// PDF field type
using PdfField_T = field::GhostLayerField< real_t, Stencil_T::Size >;
// Field Types
using ScalarField_T = field::GhostLayerField< real_t, 1 >;
using VectorField_T = field::GhostLayerField< real_t, Stencil_T::D >;
using TensorField_T = field::GhostLayerField< real_t, Stencil_T::D*Stencil_T::D >;
using Setter_T = pystencils::TurbulentChannel_Setter;
using StreamCollideSweep_T = pystencils::TurbulentChannel_Sweep;
using WelfordSweep_T = pystencils::TurbulentChannel_Welford;
using TKEWelfordSweep_T = pystencils::TurbulentChannel_Welford_TKE_SGS;
using TkeSgsWriter_T = pystencils::TurbulentChannel_TKE_SGS_Writer;
// Boundary Handling
using flag_t = uint8_t;
using FlagField_T = FlagField< flag_t >;
using NoSlip_T = lbm::TurbulentChannel_NoSlip;
using FreeSlip_top_T = lbm::TurbulentChannel_FreeSlip_top;
using WFB_bottom_T = lbm::TurbulentChannel_WFB_bottom;
using WFB_top_T = lbm::TurbulentChannel_WFB_top;
/// DEAN CORRELATIONS
namespace dean_correlation {
real_t calculateFrictionReynoldsNumber(const real_t reynoldsBulk) {
return std::pow(0.073_r / 8_r, 1_r / 2_r) * std::pow(reynoldsBulk, 7_r / 8_r);
}
real_t calculateBulkReynoldsNumber(const real_t reynoldsFriction) {
return std::pow(8_r / 0.073_r, 4_r / 7_r) * std::pow(reynoldsFriction, 8_r / 7_r);
}
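// Rough orientation: a target Re_tau of 180 corresponds via these correlations to a bulk
// Reynolds number of roughly 5.5e3, i.e. the regime of the classical Re_tau ~ 180 channel DNS.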
} // namespace dean_correlation
/// VELOCITY FIELD INITIALISATION
/*
* Initialises the velocity field with a logarithmic profile and sinusoidal perturbations to trigger turbulence.
* This initialisation is provided by Henrik Asmuth.
*/
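// The mean profile below follows the log law u+ = ln(y+) / kappa + B (with y+ = u_tau * y / nu);
// the superimposed sinusoidal perturbations in the wall-normal and spanwise components seed the
// transition to turbulence.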
template<typename VelocityField_T>
void setVelocityFieldsAsmuth( const std::weak_ptr<StructuredBlockStorage>& forest,
const BlockDataID & velocityFieldId, const BlockDataID & meanVelocityFieldId,
const real_t frictionVelocity, const uint_t channel_half_width,
const real_t B, const real_t kappa, const real_t viscosity,
const uint_t wallAxis, const uint_t flowAxis ) {
auto blocks = forest.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
const auto domainSize = blocks->getDomain().max();
const auto delta = real_c(channel_half_width);
const auto remAxis = 3 - wallAxis - flowAxis;
for( auto block = blocks->begin(); block != blocks->end(); ++block ) {
auto * velocityField = block->template getData<VelocityField_T>(velocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(velocityField)
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityFieldId);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
const auto ci = velocityField->xyzSizeWithGhostLayer();
for(auto cellIt = ci.begin(); cellIt != ci.end(); ++cellIt) {
Cell globalCell(*cellIt);
blocks->transformBlockLocalToGlobalCell(globalCell, *block);
Vector3<real_t> cellCenter;
blocks->getCellCenter(cellCenter, globalCell);
const auto y = cellCenter[wallAxis];
const auto rel_x = cellCenter[flowAxis] / domainSize[flowAxis];
const auto rel_z = cellCenter[remAxis] / domainSize[remAxis];
const real_t pos = std::max(delta - std::abs(y - delta - 1_r), 0.05_r);
const auto rel_y = pos / delta;
auto initialVel = frictionVelocity * (std::log(frictionVelocity * pos / viscosity) / kappa + B);
Vector3<real_t> vel;
vel[flowAxis] = initialVel;
vel[remAxis] = 2_r * frictionVelocity / kappa * std::sin(math::pi * 16_r * rel_x) *
std::sin(math::pi * 8_r * rel_y) / (std::pow(rel_y, 2_r) + 1_r);
vel[wallAxis] = 8_r * frictionVelocity / kappa *
(std::sin(math::pi * 8_r * rel_z) * std::sin(math::pi * 8_r * rel_y) +
std::sin(math::pi * 8_r * rel_x)) / (std::pow(0.5_r * delta - pos, 2_r) + 1_r);
for(uint_t d = 0; d < 3; ++d) {
velocityField->get(*cellIt, d) = vel[d];
meanVelocityField->get(*cellIt, d) = vel[d];
}
}
}
} // function setVelocityFieldsAsmuth
/// SIMULATION PARAMETERS
struct SimulationParameters {
SimulationParameters(const Config::BlockHandle & config)
{
channelHalfWidth = config.getParameter<uint_t>("channel_half_width");
fullChannel = config.getParameter<bool>("full_channel", false);
/// TARGET QUANTITIES
targetFrictionReynolds = config.getParameter<real_t>("target_friction_reynolds");
targetBulkVelocity = config.getParameter<real_t>("target_bulk_velocity", 0.05_r);
targetBulkReynolds = dean_correlation::calculateBulkReynoldsNumber(targetFrictionReynolds);
viscosity = 2_r * real_c(channelHalfWidth) * targetBulkVelocity / targetBulkReynolds;
targetFrictionVelocity = targetFrictionReynolds * viscosity / real_c(channelHalfWidth);
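// The lattice viscosity follows from the target bulk Reynolds number Re_b = 2 * delta * u_b / nu,
// and the target friction velocity from Re_tau = u_tau * delta / nu.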
/// TIMESTEPS
timesteps = config.getParameter<uint_t>("timesteps", 0);
const uint_t turnoverPeriods = config.getParameter<uint_t>("turnover_periods", 0);
WALBERLA_ASSERT((timesteps != 0) != (turnoverPeriods != 0),
"Either timesteps OR turnover periods must be given.")
if(turnoverPeriods != 0) {
// turnover period defined by T = delta / u_tau
timesteps = turnoverPeriods * uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
}
/// DOMAIN DEFINITIONS
// obtained from codegen script -> adapt there
wallAxis = codegen::wallAxis;
flowAxis = codegen::flowAxis;
uint_t sizeFlowAxis = config.getParameter<uint_t>("size_flow_axis", 0);
uint_t sizeRemainingAxis = config.getParameter<uint_t>("size_remaining_axis", 0);
WALBERLA_ASSERT_NOT_IDENTICAL(wallAxis, flowAxis, "Wall and flow axis must be different.")
const auto sizeFactor = channelHalfWidth / uint_t(10);
if( !sizeFlowAxis) sizeFlowAxis = sizeFactor * 64;
if( !sizeRemainingAxis) sizeRemainingAxis = sizeFactor * 32;
domainSize[wallAxis] = fullChannel ? 2 * channelHalfWidth : channelHalfWidth;
domainSize[flowAxis] = sizeFlowAxis;
domainSize[3- wallAxis - flowAxis] = sizeRemainingAxis;
periodicity[wallAxis] = false;
boundaryCondition = config.getParameter<std::string>("wall_boundary_condition", "WFB");
/// OUTPUT
auto tsPerPeriod = uint_c((real_c(channelHalfWidth) / targetFrictionVelocity));
vtkFrequency = config.getParameter<uint_t>("vtk_frequency", 0);
vtkStart = config.getParameter<uint_t>("vtk_start", 0);
plotFrequency = config.getParameter<uint_t>("plot_frequency", 0);
plotStart = config.getParameter<uint_t>("plot_start", 0);
// vtk start
vtkStart = config.getParameter<uint_t>("vtk_start_timesteps", 0);
const uint_t vtkStartPeriods = config.getParameter<uint_t>("vtk_start_periods", 0);
if(vtkStart || vtkStartPeriods) {
WALBERLA_ASSERT((vtkStart != 0) != (vtkStartPeriods != 0),
"VTK start must be given in timesteps OR periods, not both.")
}
if(vtkStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
vtkStart = vtkStartPeriods * tsPerPeriod;
}
// plot start
plotStart = config.getParameter<uint_t>("plot_start_timesteps", 0);
const uint_t plotStartPeriods = config.getParameter<uint_t>("plot_start_periods", 0);
if(plotStart || plotStartPeriods) {
WALBERLA_ASSERT((plotStart != 0) != (plotStartPeriods != 0),
"Plotting start must be given in timesteps OR periods, not both.")
}
if(plotStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
plotStart = plotStartPeriods * tsPerPeriod;
}
// frequencies
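// round the total number of time steps up to a multiple of the plot frequency so that the final
// time step coincides with a plot output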
if(plotFrequency) {
timesteps = uint_c(std::ceil(real_c(timesteps) / real_c(plotFrequency))) * plotFrequency;
}
// sampling start & interval
samplingStart = config.getParameter<uint_t>("sampling_start_timesteps", 0);
const uint_t samplingStartPeriods = config.getParameter<uint_t>("sampling_start_periods", 0);
if(samplingStart || samplingStartPeriods) {
WALBERLA_ASSERT((samplingStart != 0) != (samplingStartPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingStartPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingStart = samplingStartPeriods * tsPerPeriod;
}
samplingInterval = config.getParameter<uint_t>("sampling_interval_timesteps", 0);
const uint_t samplingIntervalPeriods = config.getParameter<uint_t>("sampling_interval_periods", 0);
if(samplingInterval || samplingIntervalPeriods) {
WALBERLA_ASSERT((samplingInterval != 0) != (samplingIntervalPeriods != 0),
"Sampling start must be given in timesteps OR periods, not both.")
}
if(samplingIntervalPeriods != 0) {
// turnover period defined by T = delta / u_tau
samplingInterval = samplingIntervalPeriods * tsPerPeriod;
}
timesteps += 1;
}
uint_t channelHalfWidth{};
bool fullChannel{};
Vector3<uint_t> domainSize{};
Vector3<uint_t> periodicity{true};
real_t targetFrictionReynolds{};
real_t targetBulkReynolds{};
real_t targetFrictionVelocity{};
real_t targetBulkVelocity{};
real_t viscosity{};
const real_t density{1.0};
uint_t timesteps{};
std::string boundaryCondition{};
uint_t wallAxis{};
uint_t flowAxis{};
/// output
uint_t vtkFrequency{};
uint_t vtkStart{};
uint_t plotFrequency{};
uint_t plotStart{};
uint_t samplingStart{};
uint_t samplingInterval{};
};
namespace boundaries {
void createBoundaryConfig(const SimulationParameters & parameters, Config::Block & boundaryBlock) {
auto & bottomWall = boundaryBlock.createBlock("Border");
bottomWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, true)]);
bottomWall.addParameter("walldistance", "-1");
if(parameters.boundaryCondition == "NoSlip") {
bottomWall.addParameter("flag", "NoSlip");
} else if(parameters.boundaryCondition == "WFB") {
bottomWall.addParameter("flag", "WFB_bottom");
}
auto & topWall = boundaryBlock.createBlock("Border");
topWall.addParameter("direction", stencil::dirToString[stencil::directionFromAxis(parameters.wallAxis, false)]);
topWall.addParameter("walldistance", "-1");
if(parameters.fullChannel) {
if (parameters.boundaryCondition == "NoSlip") {
topWall.addParameter("flag", "NoSlip");
} else if (parameters.boundaryCondition == "WFB") {
topWall.addParameter("flag", "WFB_top");
}
} else {
topWall.addParameter("flag", "FreeSlip");
}
}
}
/// BULK VELOCITY CALCULATION
template< typename VelocityField_T >
class ForceCalculator {
public:
ForceCalculator(const std::weak_ptr<StructuredBlockStorage> & blocks, const BlockDataID meanVelocityId,
const SimulationParameters & parameter)
: blocks_(blocks), meanVelocityId_(meanVelocityId), channelHalfWidth_(real_c(parameter.channelHalfWidth)),
targetBulkVelocity_(parameter.targetBulkVelocity), targetFrictionVelocity_(parameter.targetFrictionVelocity)
{
const auto & domainSize = parameter.domainSize;
Cell maxCell;
maxCell[parameter.wallAxis] = int_c(parameter.channelHalfWidth) - 1;
maxCell[flowDirection_] = int_c(domainSize[flowDirection_]) - 1;
const auto remainingIdx = 3 - parameter.wallAxis - flowDirection_;
maxCell[remainingIdx] = int_c(domainSize[remainingIdx]) - 1;
ci_ = CellInterval(Cell{}, maxCell);
numCells_ = real_c(parameter.channelHalfWidth * domainSize[flowDirection_] * domainSize[remainingIdx]);
}
real_t bulkVelocity() const { return bulkVelocity_; }
void setBulkVelocity(const real_t bulkVelocity) { bulkVelocity_ = bulkVelocity; }
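// Averages the flow-direction component of the mean velocity field over the cell interval ci_
// (one channel half) and reduces the result over all processes.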
void calculateBulkVelocity() {
// reset bulk velocity
bulkVelocity_ = 0_r;
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for( auto block = blocks->begin(); block != blocks->end(); ++block) {
auto * meanVelocityField = block->template getData<VelocityField_T>(meanVelocityId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocityField)
auto fieldSize = meanVelocityField->xyzSize();
CellInterval localCi;
blocks->transformGlobalToBlockLocalCellInterval(localCi, *block, ci_);
fieldSize.intersect(localCi);
auto * slicedField = meanVelocityField->getSlicedField(fieldSize);
WALBERLA_CHECK_NOT_NULLPTR(slicedField)
for(auto fieldIt = slicedField->beginXYZ(); fieldIt != slicedField->end(); ++fieldIt) {
const auto localMean = fieldIt[flowDirection_];
bulkVelocity_ += localMean;
}
}
mpi::allReduceInplace< real_t >(bulkVelocity_, mpi::SUM);
bulkVelocity_ /= numCells_;
}
real_t calculateDrivingForce() const {
// forcing term as in Malaspinas (2014) "Wall model for large-eddy simulation based on the lattice Boltzmann method"
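// i.e. a base term u_tau^2 / delta plus a proportional correction that nudges the bulk velocity
// towards its target value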
const auto force = targetFrictionVelocity_ * targetFrictionVelocity_ / channelHalfWidth_
+ (targetBulkVelocity_ - bulkVelocity_) * targetBulkVelocity_ / channelHalfWidth_;
return force;
}
private:
const std::weak_ptr<StructuredBlockStorage> blocks_{};
const BlockDataID meanVelocityId_{};
const uint_t flowDirection_{};
const real_t channelHalfWidth_{};
const real_t targetBulkVelocity_{};
const real_t targetFrictionVelocity_{};
CellInterval ci_{};
real_t numCells_{};
real_t bulkVelocity_{};
};
template< typename Welford_T >
class TurbulentChannelPlotter {
public:
TurbulentChannelPlotter(SimulationParameters const * const parameters, Timeloop * const timeloop,
ForceCalculator<VectorField_T> const * const forceCalculator,
const std::weak_ptr<StructuredBlockStorage> & blocks,
const BlockDataID velocityFieldId, const BlockDataID meanVelocityFieldId,
const BlockDataID meanTkeSGSFieldId, const BlockDataID sosFieldId, Welford_T * velocityWelford,
const bool separateFile = false)
: parameters_(parameters), forceCalculator_(forceCalculator), timeloop_(timeloop), blocks_(blocks),
velocityWelford_(velocityWelford), velocityFieldId_(velocityFieldId), meanVelocityFieldId_(meanVelocityFieldId),
meanTkeSGSFieldId_(meanTkeSGSFieldId), sosFieldId_(sosFieldId), plotFrequency_(parameters->plotFrequency), plotStart_(parameters->plotStart),
separateFiles_(separateFile)
{
if(!plotFrequency_)
return;
// prepare output folder
const filesystem::path path(baseFolder_);
std::string fileSuffix = parameters->boundaryCondition + "_";
if(parameters->fullChannel)
fileSuffix += "full_D";
else
fileSuffix += "half_D";
fileSuffix += std::to_string(parameters->channelHalfWidth) + "_Re" +
std::to_string(int(parameters->targetFrictionReynolds)) ;
velocityProfilesFilePath_ = path / ("velocity_profiles_" + fileSuffix);
forcingDataFilePath_ = path / ("forcing_data_" + fileSuffix + "_t" +
std::to_string(parameters->timesteps-1) + ".txt");
WALBERLA_ROOT_SECTION() {
// create directory if not existent; empty if existent
if( !filesystem::exists(path) ) {
filesystem::create_directories(path);
} else {
for (const auto& entry : filesystem::directory_iterator(path))
std::filesystem::remove_all(entry.path());
}
}
// write force header
std::ofstream os (forcingDataFilePath_, std::ios::out | std::ios::trunc);
if(os.is_open()) {
os << "# timestep\t bulk_velocity\t driving_force\n";
os.close();
} else {
WALBERLA_ABORT("Could not open forcing data file.")
}
}
void operator()() {
const auto ts = timeloop_->getCurrentTimeStep();
if(ts < plotStart_)
return;
if(!plotFrequency_ || (ts % plotFrequency_))
return;
const auto channelHalfWidth = real_c(parameters_->channelHalfWidth);
const auto bulkVelocity = forceCalculator_->bulkVelocity();
/// write force data
WALBERLA_ROOT_SECTION() {
std::ofstream forceOS(forcingDataFilePath_, std::ios::out | std::ios::app);
if (forceOS.is_open())
{
forceOS << ts << "\t" << bulkVelocity << "\t" << forceCalculator_->calculateDrivingForce() << "\n";
forceOS.close();
}
}
/// write velocity data
// gather velocity data
std::vector<real_t> instantaneousVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> meanVelocityVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeSGSVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> tkeResolvedVector(parameters_->channelHalfWidth, 0_r);
std::vector<real_t> reynoldsStressVector(parameters_->channelHalfWidth * TensorField_T::F_SIZE, 0_r);
const auto idxFlow = int_c(parameters_->domainSize[parameters_->flowAxis] / uint_t(2));
const auto idxRem = int_c(parameters_->domainSize[3 - parameters_->flowAxis - parameters_->wallAxis] / uint_t(2));
Cell point;
point[parameters_->flowAxis] = idxFlow;
point[3 - parameters_->flowAxis - parameters_->wallAxis] = idxRem;
const auto flowAxis = int_c(parameters_->flowAxis);
auto blocks = blocks_.lock();
WALBERLA_CHECK_NOT_NULLPTR(blocks)
for(auto block = blocks->begin(); block != blocks->end(); ++block) {
const auto * const velocity = block->template getData<VectorField_T>(velocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(velocity)
const auto * const meanVelocity = block->template getData<VectorField_T>(meanVelocityFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(meanVelocity)
const auto * const tkeSGS = block->template getData<ScalarField_T>(meanTkeSGSFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(tkeSGS)
const auto * const sos = block->template getData<TensorField_T>(sosFieldId_);
WALBERLA_CHECK_NOT_NULLPTR(sos)
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
point[parameters_->wallAxis] = int_c(idx);
Cell localCell;
blocks->transformGlobalToBlockLocalCell(localCell, *block, point);
if(velocity->xyzSize().contains(localCell)){
instantaneousVelocityVector[idx] = velocity->get(localCell, flowAxis);
meanVelocityVector[idx] = meanVelocity->get(localCell, flowAxis);
tkeSGSVector[idx] = tkeSGS->get(localCell);
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
reynoldsStressVector[idx*TensorField_T::F_SIZE+i] = sos->get(localCell,i) / velocityWelford_->getCounter();
}
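// the trace of the resolved Reynolds-stress tensor (components 0, 4, 8) gives twice the resolved TKE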
tkeResolvedVector[idx] = real_c(0.5) * (
reynoldsStressVector[idx*TensorField_T::F_SIZE+0] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+4] +
reynoldsStressVector[idx*TensorField_T::F_SIZE+8]
);
}
}
}
// MPI exchange information
mpi::reduceInplace(instantaneousVelocityVector, mpi::SUM);
mpi::reduceInplace(meanVelocityVector, mpi::SUM);
mpi::reduceInplace(tkeSGSVector, mpi::SUM);
mpi::reduceInplace(tkeResolvedVector, mpi::SUM);
mpi::reduceInplace(reynoldsStressVector, mpi::SUM);
WALBERLA_ROOT_SECTION()
{
std::ofstream velocityOS;
filesystem::path path = velocityProfilesFilePath_;
if (separateFiles_) {
path.concat("_t" + std::to_string(timeloop_->getCurrentTimeStep()) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
} else {
path.concat("_t" + std::to_string(parameters_->timesteps-1) + ".txt");
velocityOS.open(path, std::ios::out | std::ios::trunc);
}
if (velocityOS.is_open()) {
if (!separateFiles_) velocityOS << "# t = " << ts << "\n";
velocityOS << "# y/delta\t y+\t u+\t u_mean\t u_instantaneous\t TKE_SGS\t TKE_resolved\t uu_rms\t uv_rms\t uw_rms\t vu_rms\t vv_rms\t vw_rms\t wu_rms\t wv_rms\t ww_rms\n";
const auto & viscosity = parameters_->viscosity;
const auto bulkReynolds = 2_r * channelHalfWidth * bulkVelocity / viscosity;
const auto frictionReynolds = dean_correlation::calculateFrictionReynoldsNumber(bulkReynolds);
const auto frictionVelocity = frictionReynolds * viscosity / channelHalfWidth;
for(uint_t idx = 0; idx < parameters_->channelHalfWidth; ++idx) {
// relative position
velocityOS << (real_c(idx)+0.5_r) / channelHalfWidth << "\t";
// y+
velocityOS << (real_c(idx)+0.5_r) * frictionVelocity / viscosity << "\t";
// u+
velocityOS << meanVelocityVector[idx] / frictionVelocity << "\t";
// mean velocity
velocityOS << meanVelocityVector[idx] << "\t";
// instantaneous velocity
velocityOS << instantaneousVelocityVector[idx] << "\t";
// subgrid-scale TKE
velocityOS << tkeSGSVector[idx] << "\t";
// resolved TKE
velocityOS << tkeResolvedVector[idx] << "\t";
// Reynolds stresses
for(uint_t i = 0; i < TensorField_T::F_SIZE; ++i) {
velocityOS << reynoldsStressVector[idx*TensorField_T::F_SIZE+i] << "\t";
}
velocityOS << "\n";
}
velocityOS.close();
} else{
WALBERLA_ABORT("Could not open velocity plot file " << path.generic_string())
}
}
}
private:
SimulationParameters const * const parameters_{};
ForceCalculator<VectorField_T> const * const forceCalculator_{};
Timeloop * const timeloop_{};
const std::weak_ptr<StructuredBlockStorage> blocks_;
Welford_T * const velocityWelford_{};
const BlockDataID velocityFieldId_{};
const BlockDataID meanVelocityFieldId_{};
const BlockDataID meanTkeSGSFieldId_{};
const BlockDataID sosFieldId_{};
const uint_t plotFrequency_{};
const uint_t plotStart_{};
const bool separateFiles_{false};
const std::string baseFolder_{"output"};
filesystem::path velocityProfilesFilePath_;
filesystem::path forcingDataFilePath_;
};
/////////////////////
/// Main Function ///
/////////////////////
int main(int argc, char** argv) {
Environment walberlaEnv(argc, argv);
if (!walberlaEnv.config()) { WALBERLA_ABORT("No configuration file specified!") }
///////////////////////////////////////////////////////
/// Block Storage Creation and Simulation Parameter ///
///////////////////////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating block forest...")
const auto channelParameter = walberlaEnv.config()->getOneBlock("TurbulentChannel");
SimulationParameters simulationParameters(channelParameter);
// domain creation
std::shared_ptr<StructuredBlockForest> blocks;
{
Vector3< uint_t > numBlocks;
Vector3< uint_t > cellsPerBlock;
blockforest::calculateCellDistribution(simulationParameters.domainSize,
uint_c(mpi::MPIManager::instance()->numProcesses()),
numBlocks, cellsPerBlock);
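// The decomposition may round the per-axis cell counts; the effective domain size is recomputed
// below and a warning is issued if it differs from the requested one.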
const auto & periodicity = simulationParameters.periodicity;
auto & domainSize = simulationParameters.domainSize;
const Vector3<uint_t> newDomainSize(numBlocks[0] * cellsPerBlock[0], numBlocks[1] * cellsPerBlock[1], numBlocks[2] * cellsPerBlock[2]);
if(domainSize != newDomainSize) {
domainSize = newDomainSize;
WALBERLA_LOG_WARNING_ON_ROOT("\nWARNING: Domain size has changed due to the chosen domain decomposition.\n")
}
SetupBlockForest sforest;
sforest.addWorkloadMemorySUIDAssignmentFunction( blockforest::uniformWorkloadAndMemoryAssignment );
sforest.init( AABB(0_r, 0_r, 0_r, real_c(domainSize[0]), real_c(domainSize[1]), real_c(domainSize[2])),
numBlocks[0], numBlocks[1], numBlocks[2], periodicity[0], periodicity[1], periodicity[2] );
// calculate process distribution
const memory_t memoryLimit = numeric_cast< memory_t >( sforest.getNumberOfBlocks() );
const blockforest::GlobalLoadBalancing::MetisConfiguration< SetupBlock > metisConfig(
true, false, std::bind( blockforest::cellWeightedCommunicationCost, std::placeholders::_1, std::placeholders::_2,
cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] ) );
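// distribute the blocks to processes along a Hilbert space-filling curve, using the
// cell-weighted communication cost model configured above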
sforest.calculateProcessDistribution_Default( uint_c( MPIManager::instance()->numProcesses() ), memoryLimit,
"hilbert", 10, false, metisConfig );
if( !MPIManager::instance()->rankValid() )
MPIManager::instance()->useWorldComm();
// create StructuredBlockForest (encapsulates a newly created BlockForest)
WALBERLA_LOG_INFO_ON_ROOT("SetupBlockForest created successfully:\n" << sforest)
sforest.writeVTKOutput("domain_decomposition");
auto bf = std::make_shared< BlockForest >( uint_c( MPIManager::instance()->rank() ), sforest, false );
blocks = std::make_shared< StructuredBlockForest >( bf, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2] );
blocks->createCellBoundingBoxes();
}
////////////////////////////////////
/// PDF Field and Velocity Setup ///
////////////////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating fields...")
// Common Fields
const BlockDataID velocityFieldId = field::addToStorage< VectorField_T >(blocks, "velocity", real_c(0.0), codegen::layout);
const BlockDataID meanVelocityFieldId = field::addToStorage< VectorField_T >(blocks, "mean velocity", real_c(0.0), codegen::layout);
const BlockDataID sosFieldId = field::addToStorage< TensorField_T >(blocks, "sum of squares", real_c(0.0), codegen::layout);
const BlockDataID tkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID meanTkeSgsFieldId = field::addToStorage< ScalarField_T >(blocks, "mean_tke_SGS", real_c(0.0), codegen::layout);
const BlockDataID omegaFieldId = field::addToStorage< ScalarField_T >(blocks, "omega_out", real_c(0.0), codegen::layout);
const BlockDataID flagFieldId = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
// CPU Field for PDFs
const BlockDataID pdfFieldId = field::addToStorage< PdfField_T >(blocks, "pdf field", real_c(0.0), codegen::layout);
///////////////////////////////////////////
/// Force and bulk velocity calculation ///
///////////////////////////////////////////
ForceCalculator<VectorField_T> forceCalculator(blocks, velocityFieldId, simulationParameters);
//////////////
/// Setter ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Setting up fields...")
// Velocity field setup
setVelocityFieldsAsmuth<VectorField_T>(
blocks, velocityFieldId, meanVelocityFieldId,
simulationParameters.targetFrictionVelocity, simulationParameters.channelHalfWidth,
5.5_r, 0.41_r, simulationParameters.viscosity,
simulationParameters.wallAxis, simulationParameters.flowAxis );
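// 5.5 and 0.41 are the usual log-law constants B and kappa, consistent with the Spalding wall-function
// parameters used by the wall-function bounce (WFB) boundaries below.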
forceCalculator.setBulkVelocity(simulationParameters.targetBulkVelocity);
const auto initialForce = forceCalculator.calculateDrivingForce();
// pdfs setup
Setter_T pdfSetter(pdfFieldId, velocityFieldId, initialForce, simulationParameters.density);
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt)
pdfSetter(blockIt.get());
/////////////
/// Sweep ///
/////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating sweeps...")
const auto omega = lbm::collision_model::omegaFromViscosity(simulationParameters.viscosity);
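// standard lattice-units relation: omega = 1 / (3 * nu + 0.5), i.e. nu = (1 / omega - 0.5) / 3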
StreamCollideSweep_T streamCollideSweep(omegaFieldId, pdfFieldId, velocityFieldId, initialForce, omega);
WelfordSweep_T welfordSweep(meanVelocityFieldId, sosFieldId, velocityFieldId, 0_r);
TKEWelfordSweep_T welfordTKESweep(meanTkeSgsFieldId, tkeSgsFieldId, 0_r);
TkeSgsWriter_T tkeSgsWriter(omegaFieldId, pdfFieldId, tkeSgsFieldId, initialForce, omega);
/////////////////////////
/// Boundary Handling ///
/////////////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating boundary handling...")
const FlagUID fluidFlagUID("Fluid");
Config::Block boundaryBlock;
boundaries::createBoundaryConfig(simulationParameters, boundaryBlock);
std::unique_ptr<WFB_bottom_T> wfb_bottom_ptr = std::make_unique<WFB_bottom_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
std::unique_ptr<WFB_top_T > wfb_top_ptr = std::make_unique<WFB_top_T>(blocks, meanVelocityFieldId, pdfFieldId, omega, simulationParameters.targetFrictionVelocity);
NoSlip_T noSlip(blocks, pdfFieldId);
FreeSlip_top_T freeSlip_top(blocks, pdfFieldId);
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldId, Config::BlockHandle(&boundaryBlock));
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldId, fluidFlagUID);
noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("NoSlip"), fluidFlagUID);
freeSlip_top.fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("FreeSlip"), fluidFlagUID);
wfb_bottom_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_bottom"), fluidFlagUID);
wfb_top_ptr->fillFromFlagField< FlagField_T >(blocks, flagFieldId, FlagUID("WFB_top"), fluidFlagUID);
//////////////
/// Output ///
//////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating field output...")
// vtk output
auto vtkWriter = vtk::createVTKOutput_BlockData(
blocks, "field_writer", simulationParameters.vtkFrequency, 0, false, "vtk_out", "simulation_step",
false, false, true, false
);
vtkWriter->setInitialWriteCallsToSkip(simulationParameters.vtkStart);
// velocity field writer
auto velocityWriter = std::make_shared<field::VTKWriter<VectorField_T>>(velocityFieldId, "instantaneous velocity");
vtkWriter->addCellDataWriter(velocityWriter);
auto meanVelocityFieldWriter = std::make_shared<field::VTKWriter<VectorField_T>>(meanVelocityFieldId, "mean velocity");
vtkWriter->addCellDataWriter(meanVelocityFieldWriter);
// vtk writer
{
auto flagOutput = vtk::createVTKOutput_BlockData(
blocks, "flag_writer", 1, 1, false, "vtk_out", "simulation_step",
false, true, true, false
);
auto flagWriter = std::make_shared<field::VTKWriter<FlagField_T>>(flagFieldId, "flag field");
flagOutput->addCellDataWriter(flagWriter);
flagOutput->write();
}
/////////////////
/// Time Loop ///
/////////////////
WALBERLA_LOG_INFO_ON_ROOT("Creating timeloop...")
SweepTimeloop timeloop(blocks->getBlockStorage(), simulationParameters.timesteps);
// Communication
blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
auto setNewForce = [&](const real_t newForce) {
streamCollideSweep.setF_x(newForce);
tkeSgsWriter.setF_x(newForce);
};
// plotting
const bool outputSeparateFiles = channelParameter.getParameter<bool>("separate_files", false);
const TurbulentChannelPlotter<WelfordSweep_T > plotter(&simulationParameters, &timeloop, &forceCalculator, blocks,
velocityFieldId, meanVelocityFieldId,
meanTkeSgsFieldId, sosFieldId, &welfordSweep,
outputSeparateFiles);
//NOTE: sweeps whose parameters are modified during the run must be wrapped in lambdas; otherwise the timeloop stores a copy and, e.g., the Welford counter would stay at 0
auto welfordLambda = [&welfordSweep, &welfordTKESweep](IBlock * block) {
welfordSweep(block);
welfordTKESweep(block);
};
auto wfbLambda = [&wfb_bottom_ptr, &wfb_top_ptr](IBlock * block) {
wfb_bottom_ptr->operator()(block);
wfb_top_ptr->operator()(block);
};
auto streamCollideLambda = [&streamCollideSweep](IBlock * block) {
streamCollideSweep(block);
};
// Timeloop
timeloop.add() << BeforeFunction(communication, "communication")
<< BeforeFunction([&](){forceCalculator.calculateBulkVelocity();}, "bulk velocity calculation")
<< BeforeFunction([&](){
const auto newForce = forceCalculator.calculateDrivingForce();
setNewForce(newForce);
}, "new force setter")
<< Sweep([](IBlock *){}, "new force setter"); // empty sweep: this timeloop entry only carries the force-update BeforeFunctions
timeloop.add() << Sweep(freeSlip_top, "freeSlip");
timeloop.add() << Sweep(noSlip, "noSlip");
timeloop.add() << Sweep(wfbLambda, "wall function bounce");
timeloop.add() << Sweep(streamCollideLambda, "stream and collide");
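// The BeforeFunction below restarts the running statistics at the sampling start and after each
// sampling interval: the Welford counters are reset and the sum-of-squares and SGS-TKE fields are
// zeroed, so every interval yields an independent set of averages.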
timeloop.add() << BeforeFunction([&](){
const uint_t velCtr = uint_c(welfordSweep.getCounter());
if((timeloop.getCurrentTimeStep() == simulationParameters.samplingStart) ||
(timeloop.getCurrentTimeStep() > simulationParameters.samplingStart && simulationParameters.samplingInterval && (velCtr % simulationParameters.samplingInterval == 0))) {
welfordSweep.setCounter(real_c(0.0));
welfordTKESweep.setCounter(real_c(0.0));
for(auto & block : *blocks) {
auto * sopField = block.template getData<TensorField_T >(sosFieldId);
sopField->setWithGhostLayer(0.0);
auto * tkeField = block.template getData<ScalarField_T>(tkeSgsFieldId);
tkeField->setWithGhostLayer(0.0);
}
}
welfordSweep.setCounter(welfordSweep.getCounter() + real_c(1.0));
welfordTKESweep.setCounter(welfordTKESweep.getCounter() + real_c(1.0));
}, "welford sweep")
<< Sweep(welfordLambda, "welford sweep");
timeloop.add() << Sweep(tkeSgsWriter, "TKE_SGS writer");
timeloop.addFuncAfterTimeStep(vtk::writeFiles(vtkWriter), "VTK field output");
timeloop.addFuncAfterTimeStep(plotter, "Turbulent quantity plotting");
// LBM stability check
timeloop.addFuncAfterTimeStep( makeSharedFunctor( field::makeStabilityChecker< PdfField_T, FlagField_T >(
walberlaEnv.config(), blocks, pdfFieldId, flagFieldId, fluidFlagUID ) ),
"LBM stability check" );
// Time logger
timeloop.addFuncAfterTimeStep(timing::RemainingTimeLogger(timeloop.getNrOfTimeSteps(), 5_r),
"remaining time logger");
WALBERLA_LOG_INFO_ON_ROOT("Running timeloop with " << timeloop.getNrOfTimeSteps() - 1 << " timesteps...")
WcTimingPool timing;
WcTimer timer;
timer.start();
timeloop.run(timing);
timer.end();
double time = timer.max();
walberla::mpi::reduceInplace(time, walberla::mpi::MAX);
const auto timeloopTiming = timing.getReduced();
WALBERLA_LOG_INFO_ON_ROOT("Timeloop timing:\n" << *timeloopTiming)
const walberla::lbm::PerformanceEvaluation<FlagField_T> performance(blocks, flagFieldId, fluidFlagUID);
performance.logResultOnRoot(simulationParameters.timesteps, time);
return EXIT_SUCCESS;
}
} // namespace walberla
int main(int argc, char** argv) { return walberla::main(argc, argv); }
TurbulentChannel {
channel_half_width 20;
full_channel 0;
wall_boundary_condition WFB;
target_friction_Reynolds 395;
target_bulk_velocity 0.1;
// turnover_periods 50;
timesteps 100;
// sampling_start_timesteps 50;
// sampling_start_periods 1;
// sampling_interval_timesteps 20;
// sampling_interval_periods 1;
// vtk_start_timesteps 50;
// vtk_start_periods 1000;
// plot_start_timesteps 50;
// plot_start_periods 1000;
vtk_frequency 10;
plot_frequency 10;
separate_files 0;
}
StabilityChecker
{
checkFrequency 10000;
streamOutput false;
vtkOutput true;
}
import sympy as sp
import pystencils as ps
from lbmpy.enums import SubgridScaleModel
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, ForceModel
from lbmpy.flow_statistics import welford_assignments
from lbmpy.relaxationrates import lattice_viscosity_from_relaxation_rate
from lbmpy.creationfunctions import create_lb_update_rule
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.boundaries import NoSlip, FreeSlip, WallFunctionBounce
from lbmpy.boundaries.wall_function_models import SpaldingsLaw, MoninObukhovSimilarityTheory
from lbmpy.utils import frobenius_norm, second_order_moment_tensor
from pystencils_walberla import CodeGeneration, generate_sweep, generate_pack_info_from_kernel
from lbmpy_walberla import generate_boundary
# =====================
# Code Generation
# =====================
info_header = """
#ifndef TURBULENTCHANNEL_INCLUDES
#define TURBULENTCHANNEL_INCLUDES
#include <stencil/D{d}Q{q}.h>
#include "TurbulentChannel_Sweep.h"
#include "TurbulentChannel_PackInfo.h"
#include "TurbulentChannel_Setter.h"
#include "TurbulentChannel_Welford.h"
#include "TurbulentChannel_Welford_TKE_SGS.h"
#include "TurbulentChannel_TKE_SGS_Writer.h"
#include "TurbulentChannel_NoSlip.h"
#include "TurbulentChannel_FreeSlip_top.h"
#include "TurbulentChannel_WFB_top.h"
#include "TurbulentChannel_WFB_bottom.h"
namespace walberla {{
namespace codegen {{
using Stencil_T = walberla::stencil::D{d}Q{q};
static constexpr uint_t flowAxis = {flow_axis};
static constexpr uint_t wallAxis = {wall_axis};
static constexpr field::Layout layout = field::{layout};
}}
}}
#endif // TURBULENTCHANNEL_INCLUDES
"""
def check_axis(flow_axis, wall_axis):
assert flow_axis != wall_axis
assert flow_axis < 3
assert wall_axis < 3
with CodeGeneration() as ctx:
flow_axis = 0
wall_axis = 1
check_axis(flow_axis=flow_axis, wall_axis=wall_axis)
# ========================
# General Parameters
# ========================
target = ps.Target.CPU
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = LBStencil(Stencil.D3Q19)
omega = sp.Symbol('omega')
F_x = sp.Symbol('F_x')
force_vector = [0] * 3
force_vector[flow_axis] = F_x
layout = 'fzyx'
normal_direction_top = [0] * 3
normal_direction_top[wall_axis] = -1
normal_direction_top = tuple(normal_direction_top)
normal_direction_bottom = [0] * 3
normal_direction_bottom[wall_axis] = 1
normal_direction_bottom = tuple(normal_direction_bottom)
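# with flow_axis = 0 and wall_axis = 1 as chosen above, this yields
# force_vector = [F_x, 0, 0], normal_direction_top = (0, -1, 0) and normal_direction_bottom = (0, 1, 0)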
# PDF Fields
pdfs, pdfs_tmp = ps.fields(f'pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {data_type}[{stencil.D}D]', layout=layout)
# Output Fields
omega_field = ps.fields(f"omega_out: {data_type}[{stencil.D}D]", layout=layout)
sgs_tke = ps.fields(f"sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
mean_sgs_tke = ps.fields(f"mean_sgs_tke: {data_type}[{stencil.D}D]", layout=layout)
velocity = ps.fields(f"velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
mean_velocity = ps.fields(f"mean_velocity({stencil.D}): {data_type}[{stencil.D}D]", layout=layout)
sum_of_squares_field = ps.fields(f"sum_of_squares_field({stencil.D**2}): {data_type}[{stencil.D}D]", layout=layout)
# LBM Optimisation
lbm_opt = LBMOptimisation(cse_global=True,
symbolic_field=pdfs,
symbolic_temporary_field=pdfs_tmp,
field_layout=layout)
# ==================
# Method Setup
# ==================
lbm_config = LBMConfig(stencil=stencil,
method=Method.CUMULANT,
force_model=ForceModel.GUO,
force=tuple(force_vector),
relaxation_rate=omega,
subgrid_scale_model=SubgridScaleModel.QR,
# galilean_correction=True,
compressible=True,
omega_output_field=omega_field,
output={'velocity': velocity})
update_rule = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
lbm_method = update_rule.method
# ========================
# PDF Initialization
# ========================
initial_rho = sp.Symbol('rho_0')
pdfs_setter = macroscopic_values_setter(lbm_method,
initial_rho,
velocity.center_vector,
pdfs.center_vector)
# LBM Sweep
generate_sweep(ctx, "TurbulentChannel_Sweep", update_rule, field_swaps=[(pdfs, pdfs_tmp)], target=target)
# Pack Info
generate_pack_info_from_kernel(ctx, "TurbulentChannel_PackInfo", update_rule, target=target)
# Macroscopic Values Setter
generate_sweep(ctx, "TurbulentChannel_Setter", pdfs_setter, target=target, ghost_layers_to_include=1)
# Welford update
# welford_update = welford_assignments(vector_field=velocity, mean_vector_field=mean_velocity)
welford_update = welford_assignments(field=velocity, mean_field=mean_velocity,
sum_of_squares_field=sum_of_squares_field)
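# welford_assignments generates Welford's online update rules: per cell and component, with n samples already seen,
#   delta = x - mean;  mean += delta / (n + 1);  sum_of_squares += delta * (x - mean)
# so dividing sum_of_squares by the sample count gives a running (co)variance without storing individual samples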
generate_sweep(ctx, "TurbulentChannel_Welford", welford_update, target=target)
tke_welford_update = welford_assignments(field=sgs_tke, mean_field=mean_sgs_tke)
generate_sweep(ctx, "TurbulentChannel_Welford_TKE_SGS", tke_welford_update, target=target)
# subgrid TKE output
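# The kernel below estimates the subgrid-scale TKE per cell from the non-equilibrium PDFs: the strain-rate
# magnitude is computed from the second-order moments of f_neq, and the eddy viscosity is the difference
# between the locally relaxed lattice viscosity (omega_out) and the molecular lattice viscosity (omega).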
@ps.kernel
def tke_sgs_writer():
f_neq = sp.Matrix(pdfs.center_vector) - lbm_method.get_equilibrium_terms()
rho = lbm_method.conserved_quantity_computation.density_symbol
strain_rate = frobenius_norm(-3 * omega_field.center / (2 * rho) * second_order_moment_tensor(f_neq, lbm_method.stencil))
eddy_viscosity = lattice_viscosity_from_relaxation_rate(omega_field.center) - lattice_viscosity_from_relaxation_rate(omega)
sgs_tke.center @= (eddy_viscosity * strain_rate**2)**(2.0/3.0)
tke_sgs_ac = ps.AssignmentCollection(
[lbm_method.conserved_quantity_computation.equilibrium_input_equations_from_pdfs(pdfs.center_vector),
*tke_sgs_writer]
)
generate_sweep(ctx, "TurbulentChannel_TKE_SGS_Writer", tke_sgs_ac)
# Boundary conditions
nu = lattice_viscosity_from_relaxation_rate(omega)
u_tau_target = sp.Symbol("target_u_tau")
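# The wall-function bounce boundaries below use Spalding's implicit law of the wall,
#   y+ = u+ + exp(-kappa*B) * (exp(kappa*u+) - 1 - kappa*u+ - (kappa*u+)**2/2 - (kappa*u+)**3/6),
# with kappa = 0.41 and B = 5.5, solved with a fixed number of Newton iterations (newton_steps=5).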
noslip = NoSlip()
freeslip_top = FreeSlip(stencil, normal_direction=normal_direction_top)
wfb_top = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_top,
wall_function_model=SpaldingsLaw(viscosity=nu,
kappa=0.41, b=5.5, newton_steps=5),
mean_velocity=mean_velocity, data_type=data_type,
target_friction_velocity=u_tau_target)
wfb_bottom = WallFunctionBounce(lbm_method, pdfs, normal_direction=normal_direction_bottom,
wall_function_model=SpaldingsLaw(viscosity=nu,
kappa=0.41, b=5.5, newton_steps=5),
mean_velocity=mean_velocity, data_type=data_type,
target_friction_velocity=u_tau_target)
generate_boundary(ctx, "TurbulentChannel_NoSlip", noslip, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_FreeSlip_top", freeslip_top, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_WFB_bottom", wfb_bottom, lbm_method, target=target)
generate_boundary(ctx, "TurbulentChannel_WFB_top", wfb_top, lbm_method, target=target)
info_header_params = {
'layout': layout,
'd': stencil.D,
'q': stencil.Q,
'flow_axis': flow_axis,
'wall_axis': wall_axis
}
ctx.write_file("CodegenIncludes.h", info_header.format(**info_header_params))
@@ -2,5 +2,5 @@
waLBerla_link_files_to_builddir( "*.dat" )
waLBerla_add_executable ( NAME UniformGridBenchmark
                          DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::lbm walberla::postprocessing walberla::timeloop walberla::vtk walberla::sqlite )
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
CODEGEN_CFG ${config}
OUT_FILES UniformGridCPUStorageSpecification.h UniformGridCPUStorageSpecification.cpp
UniformGridCPUSweepCollection.h UniformGridCPUSweepCollection.cpp
NoSlip.cpp NoSlip.h
UBB.cpp UBB.h
UniformGridCPUBoundaryCollection.h
UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h
UniformGridCPU_InfoHeader.h
)
waLBerla_add_executable(NAME UniformGridCPU_${config}
FILES UniformGridCPU.cpp
DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::domain_decomposition walberla::field walberla::geometry walberla::python_coupling walberla::timeloop walberla::vtk UniformGridCPUGenerated_${config} )
# all configurations are excluded from the ALL target except for pull + d3q27
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
else()
set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
endforeach ()
endforeach()
endforeach()
@@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks,
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridCPU.cpp
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/Environment.h"
#include "core/OpenMP.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include "domain_decomposition/SharedSweep.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/communication/UniformGeneratedPdfPackInfo.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
#include "timeloop/all.h"
#include <iomanip>
#include "InitShearVelocity.h"
#include "ManualKernels.h"
#include "UniformGridCPU_InfoHeader.h"
using namespace walberla;
using StorageSpecification_T = lbm::UniformGridCPUStorageSpecification;
using Stencil_T = lbm::UniformGridCPUStorageSpecification::Stencil;
using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
using FlagField_T = FlagField< uint8_t >;
using BoundaryCollection_T = lbm::UniformGridCPUBoundaryCollection< FlagField_T >;
using SweepCollection_T = lbm::UniformGridCPUSweepCollection;
using blockforest::communication::UniformBufferedScheme;
using macroFieldType = VelocityField_T::value_type;
using pdfFieldType = PdfField_T::value_type;
int main(int argc, char** argv)
{
const mpi::Environment env(argc, argv);
const std::string input_filename(argv[1]);
const bool inputIsPython = string_ends_with(input_filename, ".py");
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
WALBERLA_MPI_WORLD_BARRIER()
auto config = *cfg;
logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig(config);
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
// Creating fields
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto fieldAllocator = make_shared< field::AllocateAligned< pdfFieldType, 64 > >();
const BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Initialize velocity on cpu
if (initShearFlow)
{
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
initShearVelocity(blocks, velFieldId);
}
const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
SweepCollection_T sweepCollection(blocks, pdfFieldId, densityFieldId, velFieldId, omega, innerOuterSplit);
for (auto& block : *blocks)
{
sweepCollection.initialise(&block);
}
const pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId);
// Boundaries
const FlagUID fluidFlagUID("Fluid");
auto boundariesConfig = config->getBlock("Boundaries");
if (boundariesConfig)
{
WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
}
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldId, fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto packInfo = std::make_shared<lbm_generated::UniformGeneratedPdfPackInfo< PdfField_T >>(pdfFieldId);
UniformBufferedScheme< Stencil_T > communication(blocks);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
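// supported strategies: "noOverlap" (communicate, then boundaries and stream-collide),
// "simpleOverlap" (overlap communication with the inner-frame stream-collide),
// "kernelOnly" (compute kernel only, no communication) and "StreamOnly" (stream step only)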
if (timeStepStrategy == "noOverlap") {
if (boundariesConfig) {
timeLoop.add() << BeforeFunction(communication, "communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
} else {
timeLoop.add() << BeforeFunction(communication, "communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
}
} else if (timeStepStrategy == "simpleOverlap") {
if (boundariesConfig) {
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");
} else {
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER), "LBM StreamCollide Outer Frame");
}
} else if (timeStepStrategy == "kernelOnly") {
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide");
} else if (timeStepStrategy == "StreamOnly") {
timeLoop.add() << Sweep(sweepCollection.stream(SweepCollection_T::ALL), "LBM Stream-Only");
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'")
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldId, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks) {
sweepCollection.calculateMacroscopicParameters(&block);
}
});
timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID);
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2));
const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1));
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop.singleStep();
auto remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
if (remainingTimeLoggerFrequency > 0)
{
auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * outerIterations,
remainingTimeLoggerFrequency);
timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
}
for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{
timeLoop.setCurrentTimeStepToZero();
WcTimingPool timeloopTiming;
WcTimer simTimer;
WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.start();
timeLoop.run(timeloopTiming);
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_ROOT_SECTION()
{
if(inputIsPython){
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return EXIT_SUCCESS;
}
from dataclasses import replace
import sympy as sp
import pystencils as ps
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule
from lbmpy.enums import Method, Stencil, SubgridScaleModel
from lbmpy.moments import get_default_moment_set_for_stencil
from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
omega = sp.symbols('omega')
omega_free = sp.Symbol('omega_free')
options_dict = {
'srt': {
'method': Method.SRT,
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': Method.TRT,
'relaxation_rate': omega,
'compressible': False,
},
'mrt': {
'method': Method.MRT,
'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
'compressible': False,
},
'mrt-overrelax': {
'method': Method.MRT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': False,
},
'central': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rate': omega,
'compressible': True,
},
'central-overrelax': {
'method': Method.CENTRAL_MOMENT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
'compressible': True,
},
'cumulant': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rate': omega,
'compressible': True,
},
'cumulant-overrelax': {
'method': Method.MONOMIAL_CUMULANT,
'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
'compressible': True,
},
'cumulant-K17': {
'method': Method.CUMULANT,
'relaxation_rate': omega,
'compressible': True,
'fourth_order_correction': 0.01
},
'entropic': {
'method': Method.TRT_KBC_N4,
'compressible': True,
'zero_centered': False,
'relaxation_rates': [omega, omega_free],
'entropic': True,
'entropic_newton_iterations': False
},
'smagorinsky': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
'relaxation_rate': omega,
},
'qr': {
'method': Method.SRT,
'subgrid_scale_model': SubgridScaleModel.QR,
'relaxation_rate': omega,
}
}
info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool vectorised = {vec};
const bool nontemporal = {nt_stores};
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
openmp = True if ctx.openmp else False
field_type = "float64" if ctx.double_accuracy else "float32"
# This base pointer specification introduces temporary pointers in the outer loop so that the inner loop
# only contains aligned memory addresses. This makes non-temporal (NT) stores much more effective, which gives
# large performance gains, especially for the pull scheme on Skylake architectures.
base_pointer_spec = None # [['spatialInner0'], ['spatialInner1']]
# cpu_vec = {"instruction_set": "best", "nontemporal": False,
# "assume_aligned": True, 'assume_sufficient_line_padding': True}
cpu_vec = {"instruction_set": None}
nt_stores = False
config_tokens = ctx.config.split('_')
assert len(config_tokens) >= 3
stencil_str = config_tokens[0]
streaming_pattern = config_tokens[1]
collision_setup = config_tokens[2]
if stencil_str == "d3q27":
stencil = LBStencil(Stencil.D3Q27)
elif stencil_str == "d3q19":
stencil = LBStencil(Stencil.D3Q19)
else:
raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")
assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
options = options_dict[collision_setup]
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
# This creates a simplified version of the central moment collision operator in which the bulk and shear
# viscosities are not separated. This is done to get a fair comparison with the monomial cumulants.
if lbm_config.method == Method.CENTRAL_MOMENT:
lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))
if not is_inplace(streaming_pattern):
lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
# This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To prevent compiler
# optimisations, the PDFs are shuffled inside each cell; otherwise, for common streaming patterns, compilers
# would typically remove the copy of the centre PDF, which leads to an overestimation of the maximum performance.
stream_only_kernel = []
for i in range(stencil.Q):
stream_only_kernel.append(ps.Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))
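# e.g. for D3Q19 this generates pdfs(0) <- pdfs(3), pdfs(1) <- pdfs(4), ..., pdfs(16) <- pdfs(0),
# pdfs(17) <- pdfs(1), pdfs(18) <- pdfs(2)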
# LB Sweep
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)
no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
boundary_object=NoSlip())
ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
boundary_object=UBB([0.05, 0, 0], data_type=field_type))
generate_lbm_package(ctx, name="UniformGridCPU",
collision_rule=collision_rule,
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=False, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
cpu_openmp=openmp, cpu_vectorize_info=cpu_vec,
base_pointer_specification=base_pointer_spec)
# Stream only kernel
cpu_vec_stream = None
if ctx.optimize_for_localhost:
cpu_vec_stream = {"instruction_set": "best", "nontemporal": True,
"assume_aligned": True, 'assume_sufficient_line_padding': True,
"assume_inner_stride_one": True}
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel,
target=ps.Target.CPU, cpu_openmp=openmp,
cpu_vectorize_info=cpu_vec_stream, base_pointer_specification=[['spatialInner0'], ['spatialInner1']])
infoHeaderParams = {
'stencil': stencil_str,
'streaming_pattern': streaming_pattern,
'collision_setup': collision_setup,
'vec': int(True if cpu_vec else False),
'nt_stores': int(nt_stores),
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
field_typedefs = {'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridCPU_InfoHeader',
field_typedefs=field_typedefs,
additional_code=info_header.format(**infoHeaderParams))
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 cells per process.
# If twice as many cells are on a process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 10
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
if time_steps < 5:
time_steps = 5
return int(time_steps)
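# e.g. with the default TIME_STEPS_FOR_128_BLOCK = 10: num_time_steps((128, 128, 128)) == 10
# and num_time_steps((64, 64, 64)) == 80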
ldc_setup = {'Border': [
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks_per_process = blocks_per_process
self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
from pprint import pformat
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
'cartesianSetup': (self.blocks_per_process == 1)
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; the write may fail when multiple benchmark processes access the database concurrently
table_name = "runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Profiling -----------------------------------
def profiling():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running 2 timesteps for profiling")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
cells = (128, 128, 128)
scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
inner_outer_split=(1, 1, 1), timesteps=2,
outer_iterations=1, warmup_steps=0))
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def overlap_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running different communication overlap strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
inner_outer_splits = [(1, 1, 1), (4, 1, 1)]
cells_per_block = [(i, i, i) for i in (16, 32, 64, 128)]
for cell_per_block in cells_per_block:
scenarios.add(Scenario(time_step_strategy='noOverlap',
inner_outer_split=(1, 1, 1),
cells_per_block=cell_per_block))
for inner_outer_split in inner_outer_splits:
scenario = Scenario(time_step_strategy='simpleOverlap',
inner_outer_split=inner_outer_split,
cells_per_block=cell_per_block)
scenarios.add(scenario)
def weak_scaling_benchmark():
wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
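# e.g. a (128, 128, 128) domain on 8 processes (assuming block_decomposition yields (2, 2, 2))
# gives cells_per_block = (64, 64, 64)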
for t in ["noOverlap", "simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
time_step_strategy=t,
outer_iterations=1,
timesteps=10,
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(128, 128, 128),
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=10)
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = "noOverlap" # "noOverlap"
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=201,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=False,
boundary_setup=True,
vtk_write_frequency=50,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
# single_node_benchmark() # measures single-node performance of the compute
# kernel only (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
if BENCHMARK == 0:
single_node_benchmark()
elif BENCHMARK == 1:
weak_scaling_benchmark()
elif BENCHMARK == 2:
strong_scaling_benchmark()
else:
validation_run()
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from pprint import pformat
try:
import machinestate as ms
except ImportError:
ms = None
# Number of time steps run for a workload of 128^3 cells per process.
# If twice as many cells are on a process, half as many time steps are run, etc.
# Increase this to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = int(os.environ.get('TIME_STEPS_FOR_128_BLOCK', 100))
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
"""
Calculate the number of time steps based on the block size.
This function computes the number of time steps required for a given block size
by scaling the time steps that could be executed on one process within one second
for a 128x128x128 cells_per_block to the given cells_per_block size.
Parameters:
block_size (tuple): A tuple of three integers representing the dimensions of the cells_per_block (x, y, z).
time_steps_for_128_block (int, optional): The number of time steps for a 128x128x128 block. Default is 100.
Returns:
int: The calculated number of time steps, with a minimum value of 5.
"""
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * time_steps_for_128_block
if time_steps < 5:
time_steps = 5
return int(time_steps)
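# e.g. with the default of 100 time steps for a 128^3 block: num_time_steps((256, 256, 256)) == 12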
ldc_setup = {'Border': [
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1,
timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
vtk_write_frequency=0, remaining_time_logger_frequency=-1, db_file_name=None):
if boundary_setup:
init_shear_flow = False
periodic = (0, 0, 0)
self.blocks_per_process = blocks_per_process
self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses())
self.cells_per_block = cells_per_block
self.periodic = periodic
self.time_step_strategy = time_step_strategy
self.omega = omega
self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
self.inner_outer_split = inner_outer_split
self.init_shear_flow = init_shear_flow
self.boundary_setup = boundary_setup
self.warmup_steps = warmup_steps
self.outer_iterations = outer_iterations
self.vtk_write_frequency = vtk_write_frequency
self.remaining_time_logger_frequency = remaining_time_logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
def config(self, print_dict=True):
config_dict = {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
'cartesianSetup': (self.blocks_per_process == 1)
},
'Parameters': {
'omega': self.omega,
'warmupSteps': self.warmup_steps,
'outerIterations': self.outer_iterations,
'timeStepStrategy': self.time_step_strategy,
'timesteps': self.timesteps,
'initShearFlow': self.init_shear_flow,
'innerOuterSplit': self.inner_outer_split,
'vtkWriteFrequency': self.vtk_write_frequency,
'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
}
}
if self.boundary_setup:
config_dict["Boundaries"] = ldc_setup
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# retry a few times; the write may fail when multiple benchmark processes access the database concurrently
table_name = "runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def weak_scaling_benchmark():
wlb.log_info_on_root("Running weak scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
for t in ["simpleOverlap"]:
scenarios.add(Scenario(time_step_strategy=t,
inner_outer_split=(1, 1, 1),
cells_per_block=(WeakX, WeakY, WeakZ),
boundary_setup=True,
outer_iterations=1,
db_file_name="weakScalingUniformGridOneBlock.sqlite3"))
def strong_scaling_benchmark():
wlb.log_info_on_root("Running strong scaling benchmark with one block per proc")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
domain_size = (StrongX, StrongY, StrongZ)
blocks = block_decomposition(wlb.mpi.numProcesses())
cells_per_block = tuple([d // b for d, b in zip(domain_size, reversed(blocks))])
for t in ["simpleOverlap"]:
scenarios.add(Scenario(cells_per_block=cells_per_block,
time_step_strategy=t,
outer_iterations=1,
timesteps=num_time_steps(cells_per_block),
boundary_setup=True,
db_file_name="strongScalingUniformGridOneBlock.sqlite3"))
def single_node_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single Node benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(128, 128, 128),
time_step_strategy='kernelOnly',
outer_iterations=1,
timesteps=10)
scenarios.add(scenario)
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
wlb.log_info_on_root("")
time_step_strategy = "noOverlap" # "noOverlap"
scenarios = wlb.ScenarioManager()
scenario = Scenario(cells_per_block=(64, 64, 64),
time_step_strategy=time_step_strategy,
timesteps=201,
outer_iterations=1,
warmup_steps=0,
init_shear_flow=False,
boundary_setup=True,
vtk_write_frequency=50,
remaining_time_logger_frequency=10)
scenarios.add(scenario)
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run
# single_node_benchmark() # measures single-node performance of the compute
# kernel only (no communication)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a smaller domain for profiling only
# validation_run()
# scaling_benchmark()
if BENCHMARK == 0:
single_node_benchmark()
elif BENCHMARK == 1:
weak_scaling_benchmark()
elif BENCHMARK == 2:
strong_scaling_benchmark()
else:
validation_run()
waLBerla_link_files_to_builddir( "*.prm" ) waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" ) waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" ) waLBerla_link_files_to_builddir( "simulation_setup" )
foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 foreach(streaming_pattern pull push aa esotwist esopull esopush)
entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt foreach(stencil d3q19 d3q27)
cumulant cumulant_d3q27 foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt) # KBC methods only for D2Q9 and D3Q27 defined
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config} continue()
FILE UniformGridGPU.py endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
CODEGEN_CFG ${config} if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h continue()
UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h set(config ${stencil}_${streaming_pattern}_${collision_setup})
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h FILE UniformGridGPU.py
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h CODEGEN_CFG ${config}
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h OUT_FILES UniformGridGPUStorageSpecification.h UniformGridGPUStorageSpecification.${CODEGEN_FILE_SUFFIX}
UniformGridGPU_Defines.h UniformGridGPUSweepCollection.h UniformGridGPUSweepCollection.${CODEGEN_FILE_SUFFIX}
) NoSlip.h NoSlip.${CODEGEN_FILE_SUFFIX}
UBB.h UBB.${CODEGEN_FILE_SUFFIX}
UniformGridGPUBoundaryCollection.h
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config} UniformGridGPU_StreamOnlyKernel.h UniformGridGPU_StreamOnlyKernel.${CODEGEN_FILE_SUFFIX}
FILES UniformGridGPU.cpp UniformGridGPU_InfoHeader.h
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config}) )
set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach () waLBerla_add_executable(NAME UniformGridGPU_${config}
FILES UniformGridGPU.cpp
DEPENDS walberla::blockforest walberla::boundary walberla::core walberla::gpu walberla::domain_decomposition walberla::field walberla::geometry walberla::lbm_generated walberla::python_coupling walberla::timeloop walberla::vtk UniformGridGPUGenerated_${config} )
foreach (config srt trt mrt smagorinsky entropic)
# all configs are excluded from all except for pull d3q27.
waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config} if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
FILE UniformGridGPU_AA.py set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
CODEGEN_CFG ${config} set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h else()
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h endforeach ()
UniformGridGPU_AA_Defines.h endforeach()
) endforeach()
waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
endforeach ()
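
# Note (usage sketch; the target name is just an example): only the pull / d3q27 variants are part of the
# default "all" target, see EXCLUDE_FROM_ALL above. Any other variant can still be built explicitly, e.g.
#   cmake --build . --target UniformGridGPU_d3q19_aa_cumulant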
@@ -16,7 +16,7 @@ inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks,
   WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
      Cell globalCell;
      blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
      const real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
      velField->get(x, y, z, 1) = real_t(0);
      velField->get(x, y, z, 2) = randomReal;
...
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridGPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "core/Environment.h" #include "core/Environment.h"
#include "core/logging/Initialization.h" #include "core/logging/Initialization.h"
#include "core/math/Random.h" #include "core/timing/RemainingTimeLogger.h"
#include "python_coupling/CreateConfig.h" #include "core/timing/TimingPool.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h" #include "field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/vtk/VTKWriter.h" #include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "cuda/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "geometry/InitBoundaryHandling.h"
using namespace walberla; #include "gpu/AddGPUFieldToStorage.h"
#include "gpu/DeviceSelectMPI.h"
#include "gpu/FieldCopy.h"
#include "gpu/GPUWrapper.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/ParallelStreams.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "lbm_generated/field/PdfField.h"
#include "lbm_generated/field/AddToStorage.h"
#include "lbm_generated/gpu/UniformGeneratedGPUPdfPackInfo.h"
#include "lbm_generated/gpu/GPUPdfField.h"
#include "lbm_generated/gpu/AddToStorage.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
#include "python_coupling/PythonCallback.h"
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; #include "timeloop/SweepTimeloop.h"
const auto Q = LatticeModel_T::Stencil::Q; #include <cmath>
#include "InitShearVelocity.h"
#include "UniformGridGPU_InfoHeader.h"
using Stencil_T = LatticeModel_T::Stencil; using namespace walberla;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
using StorageSpecification_T = lbm::UniformGridGPUStorageSpecification;
using Stencil_T = lbm::UniformGridGPUStorageSpecification::Stencil;
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID, using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >;
const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) ) using GPUPdfField_T = lbm_generated::GPUPdfField< StorageSpecification_T >;
{ using FlagField_T = FlagField< uint8_t >;
math::seedRandomGenerator(0); using BoundaryCollection_T = lbm::UniformGridGPUBoundaryCollection< FlagField_T >;
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks) using SweepCollection_T = lbm::UniformGridGPUSweepCollection;
{
auto velField = block.getData<VelocityField_T>( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
velField->get(x, y, z, 0) = xMagnitude;
} else {
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
}
using gpu::communication::UniformGPUScheme;
int main( int argc, char **argv ) using macroFieldType = VelocityField_T::value_type;
int main(int argc, char** argv)
{ {
mpi::Environment env( argc, argv ); mpi::Environment const env(argc, argv);
cuda::selectDeviceBasedOnMpiRank(); gpu::selectDeviceBasedOnMpiRank();
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{ {
WALBERLA_MPI_WORLD_BARRIER(); WALBERLA_MPI_WORLD_BARRIER()
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// SETUP AND CONFIGURATION ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
auto config = *cfg; auto config = *cfg;
logging::configureLogging( config ); logging::configureLogging(config);
auto blocks = blockforest::createUniformBlockGridFromConfig( config ); auto blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters // Reading parameters
auto parameters = config->getOneBlock( "Parameters" ); auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4));
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); const bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
// Creating fields // Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); const StorageSpecification_T StorageSpec = StorageSpecification_T();
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); const BlockDataID pdfFieldCpuID = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, uint_c(1), field::fzyx);
if( timeStepStrategy != "kernelOnlyNoInit")
{
if ( initShearFlow )
{
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
}
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
auto allocator = make_shared< gpu::HostFieldAllocator<macroFieldType> >(); // use pinned memory allocator for faster CPU-GPU memory transfers
const BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx, uint_c(1), allocator);
const BlockDataID densityFieldCpuID = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx, uint_c(1), allocator);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Boundaries // Initialize velocity on cpu
const FlagUID fluidFlagUID( "Fluid" ); if (initShearFlow)
auto boundariesConfig = config->getBlock( "Boundaries" );
bool disableBoundaries = true;
if( boundariesConfig )
{ {
disableBoundaries = false; WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig ); initShearVelocity(blocks, velFieldCpuID);
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
} }
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); const BlockDataID pdfFieldGpuID = lbm_generated::addGPUPdfFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, StorageSpec, "pdfs on GPU", true);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); const BlockDataID velFieldGpuID =
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); const BlockDataID densityFieldGpuID =
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); gpu::addGPUFieldToStorage< ScalarField_T >(blocks, densityFieldCpuID, "velocity on GPU", true);
// Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); const Cell innerOuterSplit = Cell(parameters.getParameter< Vector3<cell_idx_t> >("innerOuterSplit", Vector3<cell_idx_t>(1, 1, 1)));
for(uint_t i=0; i< 3; ++i) Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
SweepCollection_T sweepCollection(blocks, pdfFieldGpuID, densityFieldGpuID, velFieldGpuID, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], omega, innerOuterSplit);
for (auto& block : *blocks)
{ {
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { sweepCollection.initialise(&block);
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
} }
int streamHighPriority = 0; int streamHighPriority = 0;
int streamLowPriority = 0; int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); WALBERLA_GPU_CHECK(gpuDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
WALBERLA_CHECK(gpuBlockSize[2] == 1); sweepCollection.setOuterPriority(streamHighPriority);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, /// LB SWEEPS AND BOUNDARY HANDLING ///
gpuBlockSize[0], gpuBlockSize[1], //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); const pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID);
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
uint_t currentTimeStep = 0;
auto simpleOverlapTimeStep = [&] ()
{
gpuComm.startCommunication(defaultStream);
for( auto &block: *blocks )
lbKernel.inner( &block, defaultStream );
gpuComm.wait(defaultStream);
for( auto &block: *blocks )
lbKernel.outer( &block, defaultStream );
};
auto overlapTimeStep = [&]()
{
cuda::NvtxRange namedRange("timestep");
auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
innerOuterSection.run([&]( auto innerStream ) // Boundaries
{ const FlagUID fluidFlagUID("Fluid");
cuda::nameStream(innerStream, "inner stream"); auto boundariesConfig = config->getBlock("Boundaries");
for( auto &block: *blocks ) if (boundariesConfig)
{
if(!disableBoundaries)
{
auto p = boundaryInnerStreams.parallelSection( innerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
}
lbKernel.inner( &block, innerStream );
}
});
innerOuterSection.run([&]( auto outerStream )
{
cuda::nameStream(outerStream, "outer stream");
gpuComm( outerStream );
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryOuterStreams.parallelSection( outerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
}
lbKernel.outer( &block, outerStream );
}
});
currentTimeStep += 1;
};
auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
auto normalTimeStep = [&]()
{
gpuComm();
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryStreams.parallelSection( defaultStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
}
lbKernel( &block );
}
};
auto kernelOnlyFunc = [&] ()
{ {
for( auto &block: *blocks ) WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
lbKernel( &block ); geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
std::function<void()> timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function<void()>( normalTimeStep );
else if (timeStepStrategy == "complexOverlap")
timeStep = std::function<void()>( overlapTimeStep );
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
} }
else { geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'"); BoundaryCollection_T boundaryCollection(blocks, flagFieldID, pdfFieldGpuID, fluidFlagUID);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// COMMUNICATION SCHEME ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
UniformGPUScheme< Stencil_T > communication(blocks, cudaEnabledMPI);
auto packInfo = std::make_shared<lbm_generated::UniformGeneratedGPUPdfPackInfo< GPUPdfField_T >>(pdfFieldGpuID);
communication.addPackInfo(packInfo);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// TIME STEP DEFINITIONS ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
if (timeStepStrategy == "noOverlap") {
if (boundariesConfig){
timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
}else {
timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");}
} else if (timeStepStrategy == "simpleOverlap") {
if (boundariesConfig){
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame");
}else{
timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
<< Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");}
} else if (timeStepStrategy == "kernelOnly") {
timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
} else if (timeStepStrategy == "StreamOnly") {
timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only");
} else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'")
} }
timeLoop.add() << BeforeFunction( timeStep ) //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
<< Sweep( []( IBlock * ) {}, "time step" ); /// TIME LOOP SETUP ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK // VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if( vtkWriteFrequency > 0 ) if (vtkWriteFrequency > 0)
{ {
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 ); "simulation_step", false, true, true, false, 0);
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel"); auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter); vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ); vtkOutput->addBeforeFunction([&]() {
for( auto & block : *blocks ) for (auto& block : *blocks)
getterSweep( &block ); sweepCollection.calculateMacroscopicParameters(&block);
gpu::fieldCpy< VelocityField_T, gpu::GPUField< VelocityField_T::value_type > >(blocks, velFieldCpuID, velFieldGpuID);
}); });
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// BENCHMARK ///
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID);
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_c(2));
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); const uint_t outerIterations = parameters.getParameter< uint_t >("outerIterations", uint_c(1));
for(int i=0; i < warmupSteps; ++i ) for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop.singleStep(); timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds auto remainingTimeLoggerFrequency =
if (remainingTimeLoggerFrequency > 0) { parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(-1.0)); // in seconds
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency ); if (remainingTimeLoggerFrequency > 0)
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{ {
GUI gui( timeLoop, blocks, argc, argv); auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
lbm::connectToGui<LatticeModel_T>(gui); remainingTimeLoggerFrequency);
gui.run(); timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
} }
else
for (uint_t outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
{ {
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) WALBERLA_GPU_CHECK(gpuPeekAtLastError())
{
timeLoop.setCurrentTimeStepToZero(); timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer; WcTimingPool timeloopTiming;
cudaDeviceSynchronize(); WcTimer simTimer;
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start(); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
timeLoop.run(); WALBERLA_GPU_CHECK( gpuPeekAtLastError() )
cudaDeviceSynchronize(); WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
simTimer.end(); simTimer.start();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); timeLoop.run(timeloopTiming);
auto time = simTimer.last(); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); simTimer.end();
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); double time = simTimer.max();
WALBERLA_ROOT_SECTION() WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
{ performance.logResultOnRoot(timesteps, time);
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable()) const auto reducedTimeloopTiming = timeloopTiming.getReduced();
{ WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
const char * storagePattern = "twofield";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); WALBERLA_ROOT_SECTION()
pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); {
pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); python_coupling::PythonCallback pythonCallbackResults("results_callback");
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern ); if (pythonCallbackResults.isCallable())
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); {
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
// Call Python function to report results pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults(); pythonCallbackResults.data().exposeValue("numCores", performance.cores());
} pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
} pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
} pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results
pythonCallbackResults();
}
}
} }
} }
return EXIT_SUCCESS;
return 0;
} }
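
// Note (assumption): the "results_callback" looked up above is presumably provided by the driving Python
// scenario script (see the benchmark scripts linked from simulation_setup), which stores the exposed values
// such as mlupsPerProcess, e.g. in the sqlite3 database configured via db_file_name.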
import sympy as sp
import numpy as np
import pystencils as ps

from dataclasses import replace

from pystencils import Assignment
from pystencils.typing import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions

from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil, SubgridScaleModel
from lbmpy.advanced_streaming import is_inplace
from lbmpy.advanced_streaming.utility import streaming_patterns
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.moments import get_default_moment_set_for_stencil

from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator

omega = sp.symbols("omega")
omega_free = sp.Symbol("omega_free")

compile_time_block_size = False
max_threads = 256

if compile_time_block_size:
    sweep_block_size = (128, 1, 1)
else:
    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
                        TypedSymbol("cudaBlockSize1", np.int32),
                        TypedSymbol("cudaBlockSize2", np.int32))

gpu_indexing_params = {'block_size': sweep_block_size}

options_dict = {
    'srt': {
        'method': Method.SRT,
        'relaxation_rate': omega,
        'compressible': False,
    },
    'trt': {
        'method': Method.TRT,
        'relaxation_rate': omega,
        'compressible': False,
    },
    'mrt': {
        'method': Method.MRT,
        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
        'compressible': False,
    },
    'mrt-overrelax': {
        'method': Method.MRT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': False,
    },
    'central': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rate': omega,
        'compressible': True,
    },
    'central-overrelax': {
        'method': Method.CENTRAL_MOMENT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
        'compressible': True,
    },
    'cumulant': {
        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
    },
    'cumulant-overrelax': {
        'method': Method.MONOMIAL_CUMULANT,
        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)],
        'compressible': True,
    },
    'cumulant-K17': {
        'method': Method.CUMULANT,
        'relaxation_rate': omega,
        'compressible': True,
        'fourth_order_correction': 0.01
    },
    'entropic': {
        'method': Method.TRT_KBC_N4,
        'compressible': True,
        'zero_centered': False,
        'relaxation_rates': [omega, omega_free],
        'entropic': True,
        'entropic_newton_iterations': False
    },
    'smagorinsky': {
        'method': Method.SRT,
        'subgrid_scale_model': SubgridScaleModel.SMAGORINSKY,
        'relaxation_rate': omega,
    },
    'qr': {
        'method': Method.SRT,
        'subgrid_scale_model': SubgridScaleModel.QR,
        'relaxation_rate': omega,
    }
}

info_header = """
const char * infoStencil = "{stencil}";
const char * infoStreamingPattern = "{streaming_pattern}";
const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""

# DEFAULTS
optimize = True

with CodeGeneration() as ctx:
    pdf_data_type = "float64"
    field_data_type = "float64"

    config_tokens = ctx.config.split('_')
    assert len(config_tokens) >= 3
    stencil_str = config_tokens[0]
    streaming_pattern = config_tokens[1]
    collision_setup = config_tokens[2]

    if len(config_tokens) >= 4:
        optimize = (config_tokens[3] != 'noopt')

    if stencil_str == "d3q27":
        stencil = LBStencil(Stencil.D3Q27)
    elif stencil_str == "d3q19":
        stencil = LBStencil(Stencil.D3Q19)
    else:
        raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment")

    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"

    options = options_dict[collision_setup]

    assert stencil.D == 3, "This application supports only three-dimensional stencils"
    pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {pdf_data_type}[3D]", layout='fzyx')
    density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_data_type}[3D]", layout='fzyx')
    macroscopic_fields = {'density': density_field, 'velocity': velocity_field}

    lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
    lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')

    if lbm_config.method == Method.CENTRAL_MOMENT:
        lbm_config = replace(lbm_config, nested_moments=get_default_moment_set_for_stencil(stencil))

    if not is_inplace(streaming_pattern):
        lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp)
        field_swaps = [(pdfs, pdfs_tmp)]
    else:
        field_swaps = []

    # This is a microbenchmark for testing how fast Q PDFs can be updated per cell. To avoid optimisations from
    # the compiler the PDFs are shuffled inside a cell. Otherwise, for common streaming patterns compilers would
    # typically remove the copy of the center PDF which results in an overestimation of the maximum performance
    stream_only_kernel = []
    for i in range(stencil.Q):
        stream_only_kernel.append(Assignment(pdfs(i), pdfs((i + 3) % stencil.Q)))

    # LB Sweep
    collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)

    if optimize:
        collision_rule = insert_fast_divisions(collision_rule)
        collision_rule = insert_fast_sqrts(collision_rule)

    lb_method = collision_rule.method

    no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip',
                                     boundary_object=NoSlip(), field_data_type=pdf_data_type)
    ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB',
                                 boundary_object=UBB([0.05, 0, 0], data_type=field_data_type),
                                 field_data_type=pdf_data_type)

    generate_lbm_package(ctx, name="UniformGridGPU",
                         collision_rule=collision_rule,
                         lbm_config=lbm_config, lbm_optimisation=lbm_opt,
                         nonuniform=False, boundaries=[no_slip, ubb],
                         macroscopic_fields=macroscopic_fields,
                         target=ps.Target.GPU, gpu_indexing_params=gpu_indexing_params,
                         data_type=field_data_type, pdfs_data_type=pdf_data_type,
                         max_threads=max_threads)

    # Stream only kernel
    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel,
                   gpu_indexing_params={'block_size': (128, 1, 1)}, target=ps.Target.GPU,
                   max_threads=max_threads)

    infoHeaderParams = {
        'stencil': stencil_str,
        'streaming_pattern': streaming_pattern,
        'collision_setup': collision_setup,
        'cse_global': int(lbm_opt.cse_global),
        'cse_pdfs': int(lbm_opt.cse_pdfs),
    }

    field_typedefs = {'VelocityField_T': velocity_field,
                      'ScalarField_T': density_field}

    # Info header containing correct template definitions for stencil and field
    generate_info_header(ctx, 'UniformGridGPU_InfoHeader',
                         field_typedefs=field_typedefs,
                         additional_code=info_header.format(**infoHeaderParams))
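
# Usage sketch (config string is just an example): the CODEGEN_CFG value set in CMakeLists.txt is parsed
# above, so a config such as "d3q19_aa_smagorinsky" selects the D3Q19 stencil, the in-place "aa" streaming
# pattern (no temporary PDF field / field swap) and the Smagorinsky setup from options_dict; an optional
# fourth token "noopt" disables the fast-math insertions.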
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "InitShearVelocity.h"
#include "gui/Gui.h"
#ifdef WALBERLA_ENABLE_GUI
#include "lbm/gui/PdfFieldDisplayAdaptor.h"
#endif
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
#include "UniformGridGPU_AA_MacroSetter.h"
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
#include "UniformGridGPU_AA_Defines.h"
#include <cmath>
using namespace walberla;
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
cuda::selectDeviceBasedOnMpiRank();
for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
for ( auto &block : *blocks )
setterSweep( &block );
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
using cuda::communication::UniformGPUScheme;
KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
kernelEven.setOuterPriority( streamHighPriority );
kernelOdd .setOuterPriority( streamHighPriority );
auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto setupPhase = [&]() {
for ( auto &block: *blocks )
kernelEven( &block );
pullScheme->communicate();
for ( auto &block: *blocks )
kernelOdd( &block );
};
auto tearDownPhase = [&]() {
pushScheme->communicate();
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for ( auto &block : *blocks )
getterSweep( &block );
};
auto simpleOverlapTimeStep = [&]()
{
// Even
pushScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelEven.inner( &block, defaultStream );
pushScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelEven.outer( &block, defaultStream );
// Odd
pullScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelOdd.inner( &block, defaultStream );
pullScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelOdd.outer( &block, defaultStream );
};
auto normalTimeStep = [&]()
{
pushScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
pullScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
auto kernelOnlyFunc = [&]()
{
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
std::function< void() > timeStep;
if ( timeStepStrategy == "noOverlap" )
timeStep = std::function< void() >( normalTimeStep );
else if ( timeStepStrategy == "simpleOverlap" )
timeStep = simpleOverlapTimeStep;
else if ( timeStepStrategy == "kernelOnly" )
{
WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
timeStep = kernelOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
}
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
if ( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{
tearDownPhase();
setupPhase();
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
setupPhase();
for ( int i = 0; i < warmupSteps; ++i )
timeLoop.singleStep();
double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if ( remainingTimeLoggerFrequency > 0 )
{
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
#ifdef WALBERLA_ENABLE_GUI
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
GUI gui( timeLoop, blocks, argc, argv);
gui.registerDisplayAdaptorCreator(
[&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
if ( block.isDataOfType< PdfField_T >( blockDataID) )
return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
return nullptr;
});
gui.run();
#else
WALBERLA_ABORT_NO_DEBUG_INFO("Application was built without GUI. Set useGui to false or re-compile with GUI.")
#endif
}
else
{
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
const char * storagePattern = "aa";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
}
return 0;
}
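
// Note: the AA ("in-place") variant above advances two lattice time steps per loop iteration
// (kernelEven followed by kernelOdd), which is why its SweepTimeloop is constructed with timesteps / 2.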
#!/usr/bin/env python3
import os
from waLBerla.tools.config import block_decomposition
job_script_header = """
#!/bin/bash -l
#SBATCH --job-name=scaling
#SBATCH --time=01:00:00
#SBATCH --nodes={nodes}
#SBATCH -o out_scaling_{nodes}_%j.txt
#SBATCH -e err_scaling_{nodes}_%j.txt
#SBATCH --ntasks-per-core=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=normal
#SBATCH --constraint=gpu
#SBATCH --account=s1042
source ~/env.sh
export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer
export CRAY_CUDA_MPS=1 # allow GPU sharing
export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export CRAY_CUDA_MPS=1
export MPICH_RANK_REORDER_METHOD=3
export PMI_MMAP_SYNC_WAIT_TIME=300
cd {folder}
# grid_order -R -H -c 1,1,8 -g 16,16,8
ulimit -c 0
"""
job_script_exe_part = """
export WALBERLA_SCENARIO_IDX=0
while srun -n {nodes} ./{app} {config}
do
((WALBERLA_SCENARIO_IDX++))
done
"""
streaming_patterns = ['pull', 'push', 'aa', 'esotwist']
stencils = ['d3q27', 'd3q19']
methods = ['srt', 'mrt', 'cumulant', 'entropic']
all_executables = []
for stencil in stencils:
for streaming_pattern in streaming_patterns:
for method in methods:
all_executables.append(f"UniformGridGPU_{stencil}_{streaming_pattern}_{method}")
all_executables = tuple(all_executables)
def generate_jobscripts(exe_names=all_executables):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
folder_name = "scaling_{:04d}".format(node_count)
os.makedirs(folder_name, exist_ok=True)
# run grid_order
import subprocess
decomposition = block_decomposition(node_count)
decomposition_str = ",".join(str(e) for e in decomposition)
subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
for exe in exe_names:
job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
config='../communication_compare.py')
with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
f.write(job_script)
if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
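
# Usage sketch: running this file directly writes one scaling_<NNNN>/job.sh per node count listed above;
# each job.sh keeps incrementing WALBERLA_SCENARIO_IDX and re-launching the app until srun exits with a
# non-zero status (presumably once the ScenarioManager has no scenario left for that index).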
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/math/Random.h"
#include "geometry/all.h"
#include "gpu/HostFieldAllocator.h"
#include "gpu/communication/GPUPackInfo.h"
#include "gpu/ParallelStreams.h"
#include "gpu/NVTX.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "gpu/AddGPUFieldToStorage.h"
#include "gpu/communication/UniformGPUScheme.h"
#include "gpu/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
#include "UniformGridGPU_PackInfo.h"
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
#include "UniformGridGPU_Defines.h"
#include "InitShearVelocity.h"
using namespace walberla;
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
const auto Q = LatticeModel_T::Stencil::Q;
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = gpu::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
gpu::selectDeviceBasedOnMpiRank();
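// each iteration of this loop runs one scenario provided by the configuration (e.g. the Python config script)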
for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
if( timeStepStrategy != "kernelOnlyNoInit")
{
if ( initShearFlow )
{
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// the setter sweep only initializes the interior of the domain - an initial communication is required here for push schemes to work
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
}
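// create the GPU PDF field from the initialized CPU field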
BlockDataID pdfFieldGpuID = gpu::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
// Boundaries
const FlagUID fluidFlagUID( "Fluid" );
auto boundariesConfig = config->getBlock( "Boundaries" );
bool disableBoundaries = true;
if( boundariesConfig )
{
disableBoundaries = false;
geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
}
lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
// Communication setup
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
CommunicationSchemeType communicationScheme;
if( communicationSchemeStr == "GPUPackInfo_Baseline")
communicationScheme = GPUPackInfo_Baseline;
else if (communicationSchemeStr == "GPUPackInfo_Streams")
communicationScheme = GPUPackInfo_Streams;
else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
communicationScheme = UniformGPUScheme_Baseline;
else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
communicationScheme = UniformGPUScheme_Memcpy;
else if (communicationSchemeStr == "MPIDatatypes")
communicationScheme = MPIDatatypes;
else if (communicationSchemeStr == "MPIDatatypesFull")
communicationScheme = MPIDatatypesFull;
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
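// the split must leave a non-empty inner region: each block needs more than 2 * innerOuterSplit cells per direction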
for(uint_t i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
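// query the CUDA stream priority range; the outer/communication work gets higher priority so it can overlap with the bulk (inner) kernel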
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
WALBERLA_CHECK(gpuBlockSize[2] == 1);
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
gpuBlockSize[0], gpuBlockSize[1],
Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
lbKernel.setOuterPriority( streamHighPriority );
UniformGridGPU_Communication< CommunicationStencil_T, gpu::GPUField< double > >
gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
auto defaultStream = gpu::StreamRAII::newPriorityStream( streamLowPriority );
auto innerOuterStreams = gpu::ParallelStreams( streamHighPriority );
auto boundaryOuterStreams = gpu::ParallelStreams( streamHighPriority );
auto boundaryInnerStreams = gpu::ParallelStreams( streamHighPriority );
uint_t currentTimeStep = 0;
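// 'simpleOverlap' strategy: start communication, run the inner kernel on all blocks, wait for communication, then run the outer kernel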
auto simpleOverlapTimeStep = [&] ()
{
gpuComm.startCommunication(defaultStream);
for( auto &block: *blocks )
lbKernel.inner( &block, defaultStream );
gpuComm.wait(defaultStream);
for( auto &block: *blocks )
lbKernel.outer( &block, defaultStream );
};
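// 'complexOverlap' strategy: inner and outer parts run in separate parallel CUDA streams, with boundary sweeps on additional nested streams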
auto overlapTimeStep = [&]()
{
gpu::NvtxRange namedRange("timestep");
auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
innerOuterSection.run([&]( auto innerStream )
{
gpu::nameStream(innerStream, "inner stream");
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryInnerStreams.parallelSection( innerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
}
lbKernel.inner( &block, innerStream );
}
});
innerOuterSection.run([&]( auto outerStream )
{
gpu::nameStream(outerStream, "outer stream");
gpuComm( outerStream );
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryOuterStreams.parallelSection( outerStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
}
lbKernel.outer( &block, outerStream );
}
});
currentTimeStep += 1;
};
auto boundaryStreams = gpu::ParallelStreams( streamHighPriority );
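// 'noOverlap' strategy: communicate first, then run boundary sweeps and the LB kernel block by block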
auto normalTimeStep = [&]()
{
gpuComm();
for( auto &block: *blocks )
{
if(!disableBoundaries)
{
auto p = boundaryStreams.parallelSection( defaultStream );
p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
}
lbKernel( &block );
}
};
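// 'kernelOnly' strategy: run just the LB kernel, without communication or boundary handling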
auto kernelOnlyFunc = [&] ()
{
for( auto &block: *blocks )
lbKernel( &block );
};
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
std::function<void()> timeStep;
if (timeStepStrategy == "noOverlap")
timeStep = std::function<void()>( normalTimeStep );
else if (timeStepStrategy == "complexOverlap")
timeStep = std::function<void()>( overlapTimeStep );
else if (timeStepStrategy == "simpleOverlap")
timeStep = simpleOverlapTimeStep;
else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
timeStep = kernelOnlyFunc;
}
else {
WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
}
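// the chosen time step runs as a BeforeFunction; the empty sweep only satisfies the timeloop's need for a per-block sweep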
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
gpu::fieldCpy<PdfField_T, gpu::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for( auto & block : *blocks )
getterSweep( &block );
});
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
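// run a few untimed warm-up steps before the measurements start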
for(int i=0; i < warmupSteps; ++i )
timeLoop.singleStep();
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if (remainingTimeLoggerFrequency > 0) {
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
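// timed benchmark: each outer iteration re-runs the time loop and reports MLUPS per process, optionally to the Python results callback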
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
const char * storagePattern = "twofield";
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return 0;
}