diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
index 571f3129ef00a3b4a2a258059b25263170cc73fb..f616ba407e10e644878df134f0f295aa75d8fcf6 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
@@ -1,9 +1,9 @@
 from pystencils.field import fields
 
-from lbmpy.advanced_streaming.utility import get_timesteps, Timestep
+from lbmpy.advanced_streaming.utility import get_timesteps
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 from lbmpy.stencils import get_stencil
-from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule
+from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
 
 from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 29755644d22f0542ac29b50e848f20dd6a2c9974..b3ef93bb589a84f807ac00464fa84cd141c1eb0c 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -4,49 +4,27 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
 
 
-foreach (config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4
-      entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt
-      cumulant cumulant_d3q27
-      srt_d3q27 mrt_d3q27 mrt_d3q27_noopt smagorinsky_d3q27 smagorinsky_d3q27_noopt mrt_full_d3q27 mrt_full_d3q27_noopt)
-
-    waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
-          FILE UniformGridGPU.py
-          CODEGEN_CFG ${config}
-          OUT_FILES UniformGridGPU_LatticeModel.cpp UniformGridGPU_LatticeModel.h
-          UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
-          UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
-          UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
-          UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
-          UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
-          UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
-          UniformGridGPU_Defines.h
-          )
-
-
-    waLBerla_add_executable(NAME UniformGridBenchmarkGPU_${config}
-          FILES UniformGridGPU.cpp
-          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_${config})
-    set_target_properties( UniformGridBenchmarkGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
-endforeach ()
-
-
-foreach (config srt trt mrt smagorinsky entropic)
-
-    waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_AA_${config}
-          FILE UniformGridGPU_AA.py
-          CODEGEN_CFG ${config}
-          OUT_FILES UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
-          UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
-          UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
-          UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
-          UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
-          UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
-          UniformGridGPU_AA_Defines.h
-          )
-
-
-    waLBerla_add_executable(NAME UniformGridBenchmarkGPU_AA_${config}
-          FILES UniformGridGPU_AA.cpp
-          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui UniformGridGPUGenerated_AA_${config})
-    set_target_properties( UniformGridBenchmarkGPU_AA_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
-endforeach ()
+foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
+    foreach(stencil d3q27) # choose from {d3q19 d3q27}
+        foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
+            set(config ${stencil}_${streaming_pattern}_${collision_setup})
+            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
+                    FILE UniformGridGPU.py
+                    CODEGEN_CFG ${config}
+                    OUT_FILES   UniformGridGPU_LbKernel.cu UniformGridGPU_LbKernel.h
+                    UniformGridGPU_PackInfoEven.cu UniformGridGPU_PackInfoEven.h
+                    UniformGridGPU_PackInfoOdd.cu UniformGridGPU_PackInfoOdd.h
+                    UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
+                    UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
+                    UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
+                    UniformGridGPU_InfoHeader.h
+                    )
+
+
+            waLBerla_add_executable(NAME UniformGridGPU_${config}
+                    FILES UniformGridGPU.cpp
+                    DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
+            set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
+        endforeach ()
+    endforeach()
+endforeach()
\ No newline at end of file
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index a123cbf34d4470fc4daa8943956faa7c206e9ad6..bdcffe3ad3a8c1992d7746f4e40fd34fc9325c4a 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -1,361 +1,302 @@
+#include "blockforest/Initialization.h"
+
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
-#include "core/math/Random.h"
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-#include "python_coupling/DictWrapper.h"
-#include "blockforest/Initialization.h"
-#include "field/FlagField.h"
-#include "field/AddToStorage.h"
-#include "field/vtk/VTKWriter.h"
-#include "field/communication/PackInfo.h"
-#include "lbm/PerformanceLogger.h"
-#include "blockforest/communication/UniformBufferedScheme.h"
-#include "timeloop/all.h"
-#include "core/math/Random.h"
-#include "geometry/all.h"
-#include "cuda/HostFieldAllocator.h"
-#include "cuda/communication/GPUPackInfo.h"
-#include "cuda/ParallelStreams.h"
-#include "cuda/NVTX.h"
-#include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
+#include "core/timing/TimingPool.h"
+
 #include "cuda/AddGPUFieldToStorage.h"
-#include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/DeviceSelectMPI.h"
-#include "domain_decomposition/SharedSweep.h"
-#include "gui/Gui.h"
-#include "lbm/gui/Connection.h"
-
-#include "UniformGridGPU_LatticeModel.h"
-#include "UniformGridGPU_LbKernel.h"
-#include "UniformGridGPU_PackInfo.h"
-#include "UniformGridGPU_UBB.h"
-#include "UniformGridGPU_NoSlip.h"
-#include "UniformGridGPU_Communication.h"
-#include "UniformGridGPU_MacroSetter.h"
-#include "UniformGridGPU_MacroGetter.h"
-#include "UniformGridGPU_Defines.h"
+#include "cuda/ParallelStreams.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
 
+#include "field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/communication/PackInfo.h"
+#include "field/vtk/VTKWriter.h"
 
-using namespace walberla;
+#include "geometry/InitBoundaryHandling.h"
 
-using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
+#include "lbm/inplace_streaming/TimestepTracker.h"
 
-const auto Q = LatticeModel_T::Stencil::Q;
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/DictWrapper.h"
+#include "python_coupling/PythonCallback.h"
 
+#include "timeloop/SweepTimeloop.h"
 
-using Stencil_T = LatticeModel_T::Stencil;
-using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
-using PdfField_T = GhostLayerField<real_t, Q>;
-using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
-using VelocityField_T = GhostLayerField<real_t, 3>;
-using flag_t = walberla::uint8_t;
-using FlagField_T = FlagField<flag_t>;
+#include "InitShearVelocity.h"
 
+#include <cmath>
 
-void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
-                       const real_t xMagnitude=real_t(0.1), const real_t fluctuationMagnitude=real_t(0.05) )
-{
-    math::seedRandomGenerator(0);
-    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
-    for( auto & block: *blocks)
-    {
-        auto velField = block.getData<VelocityField_T>( velFieldID );
-        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
-            Cell globalCell;
-            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
-            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
-            velField->get(x, y, z, 1) = real_t(0);
-            velField->get(x, y, z, 2) = randomReal;
-
-            if( globalCell[2] >= halfZ ) {
-                velField->get(x, y, z, 0) = xMagnitude;
-            } else {
-                velField->get(x, y, z, 0) = -xMagnitude;
-            }
-        );
-    }
-}
+#include "UniformGridGPU_InfoHeader.h"
+using namespace walberla;
 
+using FlagField_T            = FlagField<uint8_t>;
 
-int main( int argc, char **argv )
+int main(int argc, char** argv)
 {
-   mpi::Environment env( argc, argv );
+   mpi::Environment env(argc, argv);
    cuda::selectDeviceBasedOnMpiRank();
 
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
-      WALBERLA_MPI_WORLD_BARRIER();
+      WALBERLA_MPI_WORLD_BARRIER()
+
+      WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                        SETUP AND CONFIGURATION                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
       auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+      logging::configureLogging(config);
+      auto blocks = blockforest::createUniformBlockGridFromConfig(config);
 
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+      Vector3< uint_t > cellsPerBlock =
+         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
       // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      auto parameters        = config->getOneBlock("Parameters");
+      const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
       const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
 
       // Creating fields
-      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
-      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
+      BlockDataID pdfFieldCpuID =
+         field::addToStorage< PdfField_T >(blocks, "pdfs cpu", real_t(std::nan("")), field::fzyx);
+      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
+
+      // Initialize velocity on cpu
+      if( initShearFlow ){
+         WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
+         initShearVelocity(blocks, velFieldCpuID);
+      }
+
+      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >(blocks, pdfFieldCpuID, "pdfs on GPU", true);
+      // Velocity field is copied to the GPU
+      BlockDataID velFieldGpuID =
+         cuda::addGPUFieldToStorage< VelocityField_T >(blocks, velFieldCpuID, "velocity on GPU", true);
 
-      if( timeStepStrategy != "kernelOnlyNoInit")
+      pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldGpuID, velFieldGpuID);
+
+      // Set up initial PDF values
+      for (auto& block : *blocks)
+         setterSweep(&block);
+
+      Vector3< int > innerOuterSplit =
+         parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1));
+
+      for (uint_t i = 0; i < 3; ++i)
       {
-          if ( initShearFlow )
-          {
-              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
-              initShearVelocity( blocks, velFieldCpuID );
-          }
-
-          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
-          for( auto & block : *blocks )
-              setterSweep( &block );
-
-          // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
-          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
-          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
-          initialComm();
+         if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
+         {
+            WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
+         }
       }
 
-      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
-      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
+      Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
+      bool cudaEnabledMPI = parameters.getParameter< bool >("cudaEnabledMPI", false);
+      Vector3< int32_t > gpuBlockSize =
+         parameters.getParameter< Vector3< int32_t > >("gpuBlockSize", Vector3< int32_t >(256, 1, 1));
 
+      int streamHighPriority = 0;
+      int streamLowPriority  = 0;
+      WALBERLA_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority))
+
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                      LB SWEEPS AND BOUNDARY HANDLING                                       ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      using LbSweep      = lbm::UniformGridGPU_LbKernel;
+      using PackInfoEven = lbm::UniformGridGPU_PackInfoEven;
+      using PackInfoOdd  = lbm::UniformGridGPU_PackInfoOdd;
+      using cuda::communication::UniformGPUScheme;
+
+      LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
+      lbSweep.setOuterPriority(streamHighPriority);
 
       // Boundaries
       const FlagUID fluidFlagUID( "Fluid" );
+      BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
       auto boundariesConfig = config->getBlock( "Boundaries" );
-      bool disableBoundaries = true;
+      bool boundaries = false;
       if( boundariesConfig )
       {
-          disableBoundaries = false;
-          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
-          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+         boundaries = true;
+         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
       }
 
-      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
+      noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
 
-      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
-      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
-
-       // Communication setup
-      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
-      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
-      CommunicationSchemeType communicationScheme;
-      if( communicationSchemeStr == "GPUPackInfo_Baseline")
-          communicationScheme = GPUPackInfo_Baseline;
-      else if (communicationSchemeStr == "GPUPackInfo_Streams")
-          communicationScheme = GPUPackInfo_Streams;
-      else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
-          communicationScheme = UniformGPUScheme_Baseline;
-      else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
-          communicationScheme = UniformGPUScheme_Memcpy;
-      else if (communicationSchemeStr == "MPIDatatypes")
-          communicationScheme = MPIDatatypes;
-      else if (communicationSchemeStr == "MPIDatatypesFull")
-          communicationScheme = MPIDatatypesFull;
-      else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
-      }
+      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
+      ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
 
-      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
-      for(uint_t i=0; i< 3; ++i)
-      {
-          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
-              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
-          }
-      }
+      // Initial setup is the post-collision state of an even time step
+      auto tracker = make_shared< lbm::TimestepTracker >(0);
 
-      int streamHighPriority = 0;
-      int streamLowPriority = 0;
-      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
-      WALBERLA_CHECK(gpuBlockSize[2] == 1);
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
-                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
-                                                    gpuBlockSize[0], gpuBlockSize[1],
-                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
-      lbKernel.setOuterPriority( streamHighPriority );
-      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
-         gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
-
-      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
-      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
-      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
-      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
-
-      uint_t currentTimeStep = 0;
-
-      auto simpleOverlapTimeStep = [&] ()
-      {
-          gpuComm.startCommunication(defaultStream);
-          for( auto &block: *blocks )
-              lbKernel.inner( &block, defaultStream );
-          gpuComm.wait(defaultStream);
-          for( auto &block: *blocks )
-              lbKernel.outer( &block, defaultStream );
-      };
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                           COMMUNICATION SCHEME                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-      auto overlapTimeStep = [&]()
-      {
-         cuda::NvtxRange namedRange("timestep");
-         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
+      UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
+      auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
+      comm.addPackInfo(packInfo);
 
-         innerOuterSection.run([&]( auto innerStream )
-         {
-            cuda::nameStream(innerStream, "inner stream");
-            for( auto &block: *blocks )
-            {
-               if(!disableBoundaries)
-               {
-                  auto p = boundaryInnerStreams.parallelSection( innerStream );
-                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
-                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
-               }
-               lbKernel.inner( &block, innerStream );
-            }
-         });
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                          TIME STEP DEFINITIONS                                             ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-         innerOuterSection.run([&]( auto outerStream )
-         {
-            cuda::nameStream(outerStream, "outer stream");
-            gpuComm( outerStream );
+      auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
 
-            for( auto &block: *blocks )
-            {
-               if(!disableBoundaries)
-               {
-                  auto p = boundaryOuterStreams.parallelSection( outerStream );
-                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
-                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
-               }
-               lbKernel.outer( &block, outerStream );
-            }
-         });
-         currentTimeStep += 1;
+      auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.run(block, t, stream);
+         ubb.run(block, t, stream);
       };
 
+      auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.inner(block, t, stream);
+         ubb.inner(block, t, stream);
+      };
 
-      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
-      auto normalTimeStep = [&]()
-      {
-         gpuComm();
-         for( auto &block: *blocks )
-         {
-            if(!disableBoundaries)
-            {
-               auto p = boundaryStreams.parallelSection( defaultStream );
-               p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
-               p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
-            }
-            lbKernel( &block );
+      auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+         noSlip.outer(block, t, stream);
+         ubb.outer(block, t, stream);
+      };
+
+      auto simpleOverlapTimeStep = [&]() {
+         // Communicate post-collision values of previous timestep...
+         comm.startCommunication(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
+            lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
          }
+         comm.wait(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
+            lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
+         }
+
+         tracker->advance();
       };
 
-      auto kernelOnlyFunc = [&] ()
-      {
-          for( auto &block: *blocks )
-              lbKernel( &block );
+      auto normalTimeStep = [&]() {
+         comm.communicate(defaultStream);
+         for (auto& block : *blocks){
+            if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
+            lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
+         }
+
+         tracker->advance();
       };
 
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+      // With two-fields patterns, ghost layer cells act as constant stream-in boundaries;
+      // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries.
+      auto kernelOnlyFunc = [&]() {
+         tracker->advance();
+         for (auto& block : *blocks)
+            lbSweep(&block, tracker->getCounter(), defaultStream);
+      };
 
-      std::function<void()> timeStep;
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                             TIME LOOP SETUP                                                ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps);
+
+      const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
+      std::function< void() > timeStep;
       if (timeStepStrategy == "noOverlap")
-          timeStep = std::function<void()>( normalTimeStep );
-      else if (timeStepStrategy == "complexOverlap")
-          timeStep = std::function<void()>( overlapTimeStep );
+         timeStep = std::function< void() >(normalTimeStep);
       else if (timeStepStrategy == "simpleOverlap")
-          timeStep = simpleOverlapTimeStep;
-      else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
-          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
-          timeStep = kernelOnlyFunc;
+         timeStep = simpleOverlapTimeStep;
+      else if (timeStepStrategy == "kernelOnly")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT(
+            "Running only compute kernel without boundary - this makes only sense for benchmarking!")
+         // Run initial communication once to provide any missing stream-in populations
+         comm.communicate();
+         timeStep = kernelOnlyFunc;
       }
-      else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
+      else
+      {
+         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
+                                      "'simpleOverlap', 'kernelOnly'")
       }
 
-      timeLoop.add() << BeforeFunction( timeStep  )
-                     << Sweep( []( IBlock * ) {}, "time step" );
-
-      pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
+      timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
 
       // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
+      uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
+      if (vtkWriteFrequency > 0)
       {
-         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                          "simulation_step", false, true, true, false, 0 );
-         auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
+         auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                         "simulation_step", false, true, true, false, 0);
+         auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldCpuID, "vel");
          vtkOutput->addCellDataWriter(velWriter);
-         vtkOutput->addBeforeFunction( [&]() {
-             cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-             for( auto & block : *blocks )
-                 getterSweep( &block );
+
+         vtkOutput->addBeforeFunction([&]() {
+           cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
          });
-         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
 
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      ///                                               BENCHMARK                                                    ///
+      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
+      int warmupSteps     = parameters.getParameter< int >("warmupSteps", 2);
+      int outerIterations = parameters.getParameter< int >("outerIterations", 1);
+      for (int i = 0; i < warmupSteps; ++i)
          timeLoop.singleStep();
 
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
-          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
+      double remainingTimeLoggerFrequency =
+         parameters.getParameter< double >("remainingTimeLoggerFrequency", -1.0); // in seconds
+      if (remainingTimeLoggerFrequency > 0)
       {
-          GUI gui( timeLoop, blocks, argc, argv);
-          lbm::connectToGui<LatticeModel_T>(gui);
-          gui.run();
+         auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations),
+                                                   remainingTimeLoggerFrequency);
+         timeLoop.addFuncAfterTimeStep(logger, "remaining time logger");
       }
-      else
+
+      for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
       {
-          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-          {
-              timeLoop.setCurrentTimeStepToZero();
-              WcTimer simTimer;
-              cudaDeviceSynchronize();
-              WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-              simTimer.start();
-              timeLoop.run();
-              cudaDeviceSynchronize();
-              simTimer.end();
-              WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-              auto time = simTimer.last();
-              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-              WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-              WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-              WALBERLA_ROOT_SECTION()
-              {
-                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                  if ( pythonCallbackResults.isCallable())
-                  {
-                      const char * storagePattern = "twofield";
-                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                      pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
-                      pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
-                      pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
-                      // Call Python function to report results
-                      pythonCallbackResults();
-                  }
-              }
-          }
+         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+
+         timeLoop.setCurrentTimeStepToZero();
+         WcTimer simTimer;
+         cudaDeviceSynchronize();
+         WALBERLA_CUDA_CHECK(cudaPeekAtLastError())
+         WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
+         simTimer.start();
+         timeLoop.run();
+         cudaDeviceSynchronize();
+         simTimer.end();
+         WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
+         auto time      = simTimer.last();
+         auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
+
+         auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
+         WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess)
+         WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps))
+         WALBERLA_ROOT_SECTION()
+         {
+            python_coupling::PythonCallback pythonCallbackResults("results_callback");
+            if (pythonCallbackResults.isCallable())
+            {
+               pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
+               pythonCallbackResults.data().exposeValue("stencil", infoStencil);
+               pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
+               pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
+               pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
+               pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
+               // Call Python function to report results
+               pythonCallbackResults();
+            }
+         }
       }
    }
 
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 6f9670d1d6b3609b21be57fc7d026389229fb320..7d3c8e7f21e602128564f4ea8e22a119e7080989 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -1,19 +1,22 @@
 import sympy as sp
 import numpy as np
 import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
-from lbmpy.boundaries import NoSlip, UBB
-from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
-from pystencils_walberla import generate_pack_info_from_kernel
-from lbmpy_walberla import generate_lattice_model, generate_boundary
-from pystencils_walberla import CodeGeneration, generate_sweep
+
 from pystencils.data_types import TypedSymbol
 from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
-from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
+
+from lbmpy.advanced_streaming import Timestep, is_inplace
+from lbmpy.advanced_streaming.utility import streaming_patterns
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_collision_rule
+from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
+from lbmpy.stencils import get_stencil
+
+from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
+from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
 
 omega = sp.symbols("omega")
 omega_free = sp.Symbol("omega_free")
-omega_fill = sp.symbols("omega_:10")
 compile_time_block_size = False
 
 if compile_time_block_size:
@@ -21,156 +24,158 @@ if compile_time_block_size:
 else:
     sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
                         TypedSymbol("cudaBlockSize1", np.int32),
-                        1)
+                        TypedSymbol("cudaBlockSize2", np.int32))
 
-sweep_params = {'block_size': sweep_block_size}
+gpu_indexing_params = {'block_size': sweep_block_size}
 
 options_dict = {
     'srt': {
         'method': 'srt',
-        'stencil': 'D3Q19',
         'relaxation_rate': omega,
         'compressible': False,
     },
     'trt': {
         'method': 'trt',
-        'stencil': 'D3Q19',
         'relaxation_rate': omega,
     },
     'mrt': {
         'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
+        'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1],
     },
-    'mrt_full': {
+    'mrt-overrelax': {
         'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
-                             omega_fill[3], omega_fill[4], omega_fill[5]],
+        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
     },
-    'entropic': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
+    'cumulant': {
+        'method': 'cumulant',
+        'relaxation_rate': omega,
         'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
-        'entropic': True,
     },
-    'entropic_kbc_n4': {
-        'method': 'trt-kbc-n4',
-        'stencil': 'D3Q27',
+    'cumulant-overrelax': {
+        'method': 'cumulant',
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
         'compressible': True,
-        'relaxation_rates': [omega, omega_free],
+    },
+    'entropic': {
+        'method': 'mrt',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
         'entropic': True,
     },
     'smagorinsky': {
         'method': 'srt',
-        'stencil': 'D3Q19',
         'smagorinsky': True,
         'relaxation_rate': omega,
-    },
-    'cumulant': {
-        'method': 'cumulant',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rate': omega,
-    },
+    }
 }
 
 info_header = """
-#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
 const char * infoStencil = "{stencil}";
-const char * infoConfigName = "{configName}";
+const char * infoStreamingPattern = "{streaming_pattern}";
+const char * infoCollisionSetup = "{collision_setup}";
 const bool infoCseGlobal = {cse_global};
 const bool infoCsePdfs = {cse_pdfs};
 """
 
+# DEFAULTS
+optimize = True
 
 with CodeGeneration() as ctx:
-    accessor = StreamPullTwoFieldsAccessor()
-    # accessor = StreamPushTwoFieldsAccessor()
-    assert not accessor.is_inplace, "This app does not work for inplace accessors"
+    config_tokens = ctx.config.split('_')
+
+    assert len(config_tokens) >= 3
+    stencil_str = config_tokens[0]
+    streaming_pattern = config_tokens[1]
+    collision_setup = config_tokens[2]
+
+    if len(config_tokens) >= 4:
+        optimize = (config_tokens[3] != 'noopt')
+
+    stencil = get_stencil(stencil_str)
+    assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}"
+
+    options = options_dict[collision_setup]
+
+    q = len(stencil)
+    dim = len(stencil[0])
+    assert dim == 3, "This app supports only three-dimensional stencils"
+    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
 
     common_options = {
-        'field_name': 'pdfs',
-        'temporary_field_name': 'pdfs_tmp',
-        'kernel_type': accessor,
-        'optimization': {'cse_global': True,
-                         'cse_pdfs': False}
+        'stencil': stencil,
+        'field_name': pdfs.name,
+        'optimization': {
+            'target': 'gpu',
+            'cse_global': True,
+            'cse_pdfs': False,
+            'symbolic_field': pdfs,
+            'field_layout': 'fzyx',
+            'gpu_indexing_params': gpu_indexing_params,
+        }
     }
-    config_name = ctx.config
-    noopt = False
-    d3q27 = False
-    if config_name.endswith("_noopt"):
-        noopt = True
-        config_name = config_name[:-len("_noopt")]
-    if config_name.endswith("_d3q27"):
-        d3q27 = True
-        config_name = config_name[:-len("_d3q27")]
-
-    options = options_dict[config_name]
-    options.update(common_options)
-    options = options.copy()
 
-    if noopt:
-        options['optimization']['cse_global'] = False
-        options['optimization']['cse_pdfs'] = False
-    if d3q27:
-        options['stencil'] = 'D3Q27'
+    options.update(common_options)
 
-    stencil_str = options['stencil']
-    q = int(stencil_str[stencil_str.find('Q') + 1:])
-    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
-    options['optimization']['symbolic_field'] = pdfs
+    if not is_inplace(streaming_pattern):
+        options['optimization']['symbolic_temporary_field'] = pdfs_tmp
+        field_swaps = [(pdfs, pdfs_tmp)]
+    else:
+        field_swaps = []
 
     vp = [
-        ('double', 'omega_0'),
-        ('double', 'omega_1'),
-        ('double', 'omega_2'),
-        ('double', 'omega_3'),
-        ('double', 'omega_4'),
-        ('double', 'omega_5'),
-        ('double', 'omega_6'),
         ('int32_t', 'cudaBlockSize0'),
         ('int32_t', 'cudaBlockSize1'),
+        ('int32_t', 'cudaBlockSize2')
     ]
-    lb_method = create_lb_method(**options)
-    update_rule = create_lb_update_rule(lb_method=lb_method, **options)
- 
-    if not noopt:
-        update_rule = insert_fast_divisions(update_rule)
-        update_rule = insert_fast_sqrts(update_rule)
-
-    # CPU lattice model - required for macroscopic value computation, VTK output etc.
-    options_without_opt = options.copy()
-    del options_without_opt['optimization']
-    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method,
-                                                                                        **options_without_opt))
-
-    # gpu LB sweep & boundaries
-    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
-                   field_swaps=[('pdfs', 'pdfs_tmp')],
-                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
-                   varying_parameters=vp)
-
-    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
-    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
+
+    # LB Sweep
+    collision_rule = create_lb_collision_rule(**options)
+
+    if optimize:
+        collision_rule = insert_fast_divisions(collision_rule)
+        collision_rule = insert_fast_sqrts(collision_rule)
+
+    lb_method = collision_rule.method
+
+    generate_alternating_lbm_sweep(ctx, 'UniformGridGPU_LbKernel', collision_rule, streaming_pattern,
+                                   optimization=options['optimization'],
+                                   inner_outer_split=True, varying_parameters=vp, field_swaps=field_swaps)
 
     # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=1.0)
-    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
-    generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
+    setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs,
+                                                   streaming_pattern=streaming_pattern,
+                                                   previous_timestep=Timestep.EVEN)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
+
+    # Boundaries
+    noslip = NoSlip()
+    ubb = UBB((0.05, 0, 0))
+
+    generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_NoSlip', noslip, lb_method, field_name=pdfs.name,
+                                      streaming_pattern=streaming_pattern, target='gpu')
+    generate_alternating_lbm_boundary(ctx, 'UniformGridGPU_UBB', ubb, lb_method, field_name=pdfs.name,
+                                      streaming_pattern=streaming_pattern, target='gpu')
 
     # communication
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
+    generate_lb_pack_info(ctx, 'UniformGridGPU_PackInfo', stencil, pdfs,
+                          streaming_pattern=streaming_pattern, target='gpu',
+                          always_generate_separate_classes=True)
 
     infoHeaderParams = {
         'stencil': stencil_str,
-        'q': q,
-        'configName': ctx.config,
+        'streaming_pattern': streaming_pattern,
+        'collision_setup': collision_setup,
         'cse_global': int(options['optimization']['cse_global']),
         'cse_pdfs': int(options['optimization']['cse_pdfs']),
     }
-    ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
+
+    stencil_typedefs = {'Stencil_T': stencil,
+                        'CommunicationStencil_T': stencil}
+    field_typedefs = {'PdfField_T': pdfs,
+                      'VelocityField_T': velocity_field}
+
+    # Info header containing correct template definitions for stencil and field
+    generate_info_header(ctx, 'UniformGridGPU_InfoHeader',
+                         stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs,
+                         additional_code=info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
deleted file mode 100644
index dbda68b72a5f2990965e2ff2c1e6e759c6fe0d2e..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-#include "core/Environment.h"
-#include "core/logging/Initialization.h"
-#include "python_coupling/CreateConfig.h"
-#include "python_coupling/PythonCallback.h"
-#include "python_coupling/DictWrapper.h"
-#include "blockforest/Initialization.h"
-#include "field/FlagField.h"
-#include "field/AddToStorage.h"
-#include "field/vtk/VTKWriter.h"
-#include "field/communication/PackInfo.h"
-#include "lbm/PerformanceLogger.h"
-#include "blockforest/communication/UniformBufferedScheme.h"
-#include "timeloop/all.h"
-#include "geometry/all.h"
-#include "cuda/HostFieldAllocator.h"
-#include "cuda/communication/GPUPackInfo.h"
-#include "cuda/ParallelStreams.h"
-#include "core/timing/TimingPool.h"
-#include "core/timing/RemainingTimeLogger.h"
-#include "cuda/AddGPUFieldToStorage.h"
-#include "cuda/communication/UniformGPUScheme.h"
-#include "cuda/DeviceSelectMPI.h"
-#include "domain_decomposition/SharedSweep.h"
-#include "InitShearVelocity.h"
-#include "gui/Gui.h"
-
-#ifdef WALBERLA_ENABLE_GUI
-#include "lbm/gui/PdfFieldDisplayAdaptor.h"
-#endif
-
-
-#include "UniformGridGPU_AA_PackInfoPush.h"
-#include "UniformGridGPU_AA_PackInfoPull.h"
-#include "UniformGridGPU_AA_MacroSetter.h"
-#include "UniformGridGPU_AA_MacroGetter.h"
-#include "UniformGridGPU_AA_LbKernelEven.h"
-#include "UniformGridGPU_AA_LbKernelOdd.h"
-#include "UniformGridGPU_AA_Defines.h"
-
-#include <cmath>
-
-using namespace walberla;
-
-using CommunicationStencil_T = Stencil_T;
-using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
-using VelocityField_T = GhostLayerField< real_t, 3 >;
-
-
-int main( int argc, char **argv )
-{
-    mpi::Environment env( argc, argv );
-    cuda::selectDeviceBasedOnMpiRank();
-
-    for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-    {
-        WALBERLA_MPI_WORLD_BARRIER();
-
-        WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-
-        auto config = *cfg;
-        logging::configureLogging( config );
-        auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-
-        Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
-        // Reading parameters
-        auto parameters = config->getOneBlock( "Parameters" );
-        const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
-        const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
-
-        // Creating fields
-        BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
-        BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-
-        WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
-        initShearVelocity( blocks, velFieldCpuID );
-
-        pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
-        pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
-
-        for ( auto &block : *blocks )
-            setterSweep( &block );
-
-        BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
-
-        Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
-
-        for(uint_t i=0; i< 3; ++i)
-        {
-            if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
-                WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
-            }
-        }
-
-        Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
-        bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-        Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
-
-        int streamHighPriority = 0;
-        int streamLowPriority = 0;
-        WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
-        WALBERLA_CHECK( gpuBlockSize[2] == 1 );
-
-
-        using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
-        using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
-        using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
-        using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
-        using cuda::communication::UniformGPUScheme;
-
-        KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
-        KernelOdd  kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
-
-        kernelEven.setOuterPriority( streamHighPriority );
-        kernelOdd .setOuterPriority( streamHighPriority );
-
-        auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
-        auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
-        pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
-        pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
-
-
-        auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
-
-        auto setupPhase = [&]() {
-            for ( auto &block: *blocks )
-                kernelEven( &block );
-
-            pullScheme->communicate();
-
-            for ( auto &block: *blocks )
-                kernelOdd( &block );
-        };
-
-
-        auto tearDownPhase = [&]() {
-            pushScheme->communicate();
-            cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-            for ( auto &block : *blocks )
-                getterSweep( &block );
-        };
-
-
-        auto simpleOverlapTimeStep = [&]()
-        {
-            // Even
-            pushScheme->startCommunication( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven.inner( &block, defaultStream );
-            pushScheme->wait( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven.outer( &block, defaultStream );
-
-            // Odd
-            pullScheme->startCommunication( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd.inner( &block, defaultStream );
-            pullScheme->wait( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd.outer( &block, defaultStream );
-        };
-
-        auto normalTimeStep = [&]()
-        {
-            pushScheme->communicate( defaultStream );
-            for ( auto &block: *blocks )
-                kernelEven( &block, defaultStream );
-
-            pullScheme->communicate( defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd( &block, defaultStream );
-        };
-
-        auto kernelOnlyFunc = [&]()
-        {
-            for ( auto &block: *blocks )
-                kernelEven( &block, defaultStream );
-            for ( auto &block: *blocks )
-                kernelOdd( &block, defaultStream );
-        };
-
-        SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-
-        const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
-        std::function< void() > timeStep;
-        if ( timeStepStrategy == "noOverlap" )
-            timeStep = std::function< void() >( normalTimeStep );
-        else if ( timeStepStrategy == "simpleOverlap" )
-            timeStep = simpleOverlapTimeStep;
-        else if ( timeStepStrategy == "kernelOnly" )
-        {
-            WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
-            timeStep = kernelOnlyFunc;
-        }
-        else
-        {
-            WALBERLA_ABORT_NO_DEBUG_INFO(
-                    "Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
-        }
-
-        timeLoop.add() << BeforeFunction( timeStep )
-                       << Sweep( []( IBlock * ) {}, "time step" );
-
-
-        // VTK
-        uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
-        if ( vtkWriteFrequency > 0 )
-        {
-            auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                             "simulation_step", false, true, true, false, 0 );
-            auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
-            vtkOutput->addCellDataWriter( velWriter );
-            vtkOutput->addBeforeFunction( [&]()
-                                          {
-                                              tearDownPhase();
-                                              setupPhase();
-                                          } );
-            timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-        }
-
-        int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
-        int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
-        setupPhase();
-        for ( int i = 0; i < warmupSteps; ++i )
-            timeLoop.singleStep();
-
-        double  remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-        if ( remainingTimeLoggerFrequency > 0 )
-        {
-            auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), remainingTimeLoggerFrequency );
-            timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-        }
-
-        bool useGui = parameters.getParameter<bool>( "useGui", false );
-        if( useGui )
-        {
-#ifdef WALBERLA_ENABLE_GUI
-            cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
-            timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
-            GUI gui( timeLoop, blocks, argc, argv);
-                gui.registerDisplayAdaptorCreator(
-                [&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
-                    if ( block.isDataOfType< PdfField_T >( blockDataID) )
-                        return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
-                    return nullptr;
-                });
-            gui.run();
-#else
-            WALBERLA_ABORT_NO_DEBUG_INFO("Application was built without GUI. Set useGui to false or re-compile with GUI.")
-#endif
-        }
-        else
-        {
-            for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-            {
-                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-
-                timeLoop.setCurrentTimeStepToZero();
-                WcTimer simTimer;
-                cudaDeviceSynchronize();
-                WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
-                WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-                simTimer.start();
-                timeLoop.run();
-                cudaDeviceSynchronize();
-                simTimer.end();
-                WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-                auto time = simTimer.last();
-                auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-
-                auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-                WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-                WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-                WALBERLA_ROOT_SECTION()
-                {
-                    python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                    if ( pythonCallbackResults.isCallable())
-                    {
-                        const char * storagePattern = "aa";
-                        pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                        pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                        pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                        pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
-                        pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
-                        pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
-                        // Call Python function to report results
-                        pythonCallbackResults();
-                    }
-                }
-            }
-        }
-    }
-
-    return 0;
-}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
deleted file mode 100644
index c7e6341ae6d9a125e09427dec23f044e212709f8..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import sympy as sp
-import numpy as np
-import pystencils as ps
-from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
-from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
-from pystencils_walberla import generate_pack_info_from_kernel
-from pystencils_walberla import CodeGeneration, generate_sweep
-from pystencils.data_types import TypedSymbol
-from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
-from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
-
-omega = sp.symbols("omega")
-omega_free = sp.Symbol("omega_free")
-compile_time_block_size = False
-
-if compile_time_block_size:
-    sweep_block_size = (128, 1, 1)
-else:
-    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                        TypedSymbol("cudaBlockSize1", np.int32),
-                        1)
-
-sweep_params = {'block_size': sweep_block_size}
-
-options_dict = {
-    'srt': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'relaxation_rate': omega,
-        'compressible': False,
-    },
-    'trt': {
-        'method': 'trt',
-        'stencil': 'D3Q19',
-        'relaxation_rate': omega,
-    },
-    'mrt': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
-    },
-    'entropic': {
-        'method': 'mrt',
-        'stencil': 'D3Q19',
-        'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
-        'entropic': True,
-    },
-    'smagorinsky': {
-        'method': 'srt',
-        'stencil': 'D3Q19',
-        'smagorinsky': True,
-        'relaxation_rate': omega,
-    }
-}
-
-
-info_header = """
-#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
-const char * infoStencil = "{stencil}";
-const char * infoConfigName = "{configName}";
-const bool infoCseGlobal = {cse_global};
-const bool infoCsePdfs = {cse_pdfs};
-"""
-
-
-with CodeGeneration() as ctx:
-    accessors = {
-        'Even': AAEvenTimeStepAccessor(),
-        'Odd': AAOddTimeStepAccessor()
-    }
-
-    common_options = {
-        'field_name': 'pdfs',
-        'optimization': {'cse_global': True,
-                         'cse_pdfs': False,
-                         'field_layout': 'fzyx',
-                         }
-    }
-    options = options_dict.get(ctx.config, options_dict['srt'])
-    options.update(common_options)
-
-    stencil_str = options['stencil']
-    q = int(stencil_str[stencil_str.find('Q') + 1:])
-    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
-    options['optimization']['symbolic_field'] = pdfs
-
-    vp = [
-        ('int32_t', 'cudaBlockSize0'),
-        ('int32_t', 'cudaBlockSize1')
-    ]
-    lb_method = create_lb_method(**options)
-
-    # Kernels
-    options_without_opt = options.copy()
-    del options_without_opt['optimization']
-    update_rules = {}
-    for name, accessor in accessors.items():
-        update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options)
-        update_rule = insert_fast_divisions(update_rule)
-        update_rule = insert_fast_sqrts(update_rule)
-        update_rules[name] = update_rule
-        generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule,
-                       inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
-                       varying_parameters=vp)
-
-    # getter & setter
-    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=1.0)
-    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
-                                                   pdfs=pdfs.center_vector, density=None)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments)
-    generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments)
-
-    # communication
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'],
-                                   kind='pull', target='gpu')
-    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'],
-                                   kind='push', target='gpu')
-
-    infoHeaderParams = {
-        'stencil': stencil_str,
-        'q': q,
-        'configName': ctx.config,
-        'cse_global': int(options['optimization']['cse_global']),
-        'cse_pdfs': int(options['optimization']['cse_pdfs']),
-    }
-    ctx.write_file("UniformGridGPU_AA_Defines.h", info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48fca7135ba30b8f546b421c1329e847e0b6d129
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.cpp
@@ -0,0 +1,328 @@
+#include "core/Environment.h"
+#include "core/logging/Initialization.h"
+#include "core/math/Random.h"
+#include "python_coupling/CreateConfig.h"
+#include "python_coupling/PythonCallback.h"
+#include "python_coupling/DictWrapper.h"
+#include "blockforest/Initialization.h"
+#include "field/FlagField.h"
+#include "field/AddToStorage.h"
+#include "field/vtk/VTKWriter.h"
+#include "field/communication/PackInfo.h"
+#include "lbm/PerformanceLogger.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "timeloop/all.h"
+#include "core/math/Random.h"
+#include "geometry/all.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/ParallelStreams.h"
+#include "cuda/NVTX.h"
+#include "core/timing/TimingPool.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "cuda/DeviceSelectMPI.h"
+#include "domain_decomposition/SharedSweep.h"
+
+#include "UniformGridGPU_LatticeModel.h"
+#include "UniformGridGPU_LbKernel.h"
+#include "UniformGridGPU_PackInfo.h"
+#include "UniformGridGPU_UBB.h"
+#include "UniformGridGPU_NoSlip.h"
+#include "UniformGridGPU_Communication.h"
+#include "UniformGridGPU_MacroSetter.h"
+#include "UniformGridGPU_MacroGetter.h"
+#include "UniformGridGPU_Defines.h"
+
+#include "InitShearVelocity.h"
+
+
+using namespace walberla;
+
+using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
+
+const auto Q = LatticeModel_T::Stencil::Q;
+
+
+using Stencil_T = LatticeModel_T::Stencil;
+using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
+using PdfField_T = GhostLayerField<real_t, Q>;
+using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
+using VelocityField_T = GhostLayerField<real_t, 3>;
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+   cuda::selectDeviceBasedOnMpiRank();
+
+   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      WALBERLA_MPI_WORLD_BARRIER();
+
+      auto config = *cfg;
+      logging::configureLogging( config );
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
+      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
+      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
+
+      // Creating fields
+      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
+      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
+
+      if( timeStepStrategy != "kernelOnlyNoInit")
+      {
+          if ( initShearFlow )
+          {
+              WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
+              initShearVelocity( blocks, velFieldCpuID );
+          }
+
+          pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
+          for( auto & block : *blocks )
+              setterSweep( &block );
+
+          // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here
+          blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
+          initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
+          initialComm();
+      }
+
+      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
+      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
+
+
+      // Boundaries
+      const FlagUID fluidFlagUID( "Fluid" );
+      auto boundariesConfig = config->getBlock( "Boundaries" );
+      bool disableBoundaries = true;
+      if( boundariesConfig )
+      {
+          disableBoundaries = false;
+          geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+          geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+      }
+
+      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
+      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
+
+      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
+      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
+
+       // Communication setup
+      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
+      const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
+      CommunicationSchemeType communicationScheme;
+      if( communicationSchemeStr == "GPUPackInfo_Baseline")
+          communicationScheme = GPUPackInfo_Baseline;
+      else if (communicationSchemeStr == "GPUPackInfo_Streams")
+          communicationScheme = GPUPackInfo_Streams;
+      else if (communicationSchemeStr == "UniformGPUScheme_Baseline")
+          communicationScheme = UniformGPUScheme_Baseline;
+      else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
+          communicationScheme = UniformGPUScheme_Memcpy;
+      else if (communicationSchemeStr == "MPIDatatypes")
+          communicationScheme = MPIDatatypes;
+      else if (communicationSchemeStr == "MPIDatatypesFull")
+          communicationScheme = MPIDatatypesFull;
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
+      }
+
+      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
+      for(uint_t i=0; i< 3; ++i)
+      {
+          if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
+              WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
+          }
+      }
+
+      int streamHighPriority = 0;
+      int streamLowPriority = 0;
+      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
+      WALBERLA_CHECK(gpuBlockSize[2] == 1);
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
+                                                    1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
+                                                    gpuBlockSize[0], gpuBlockSize[1],
+                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
+      lbKernel.setOuterPriority( streamHighPriority );
+      UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
+         gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
+
+      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
+      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
+
+      uint_t currentTimeStep = 0;
+
+      auto simpleOverlapTimeStep = [&] ()
+      {
+          gpuComm.startCommunication(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.inner( &block, defaultStream );
+          gpuComm.wait(defaultStream);
+          for( auto &block: *blocks )
+              lbKernel.outer( &block, defaultStream );
+      };
+
+      auto overlapTimeStep = [&]()
+      {
+         cuda::NvtxRange namedRange("timestep");
+         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
+
+         innerOuterSection.run([&]( auto innerStream )
+         {
+            cuda::nameStream(innerStream, "inner stream");
+            for( auto &block: *blocks )
+            {
+               if(!disableBoundaries)
+               {
+                  auto p = boundaryInnerStreams.parallelSection( innerStream );
+                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } );
+                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.inner( &block, s ); } );
+               }
+               lbKernel.inner( &block, innerStream );
+            }
+         });
+
+         innerOuterSection.run([&]( auto outerStream )
+         {
+            cuda::nameStream(outerStream, "outer stream");
+            gpuComm( outerStream );
+
+            for( auto &block: *blocks )
+            {
+               if(!disableBoundaries)
+               {
+                  auto p = boundaryOuterStreams.parallelSection( outerStream );
+                  p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } );
+                  p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip.outer( &block, s ); } );
+               }
+               lbKernel.outer( &block, outerStream );
+            }
+         });
+         currentTimeStep += 1;
+      };
+
+
+      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
+      auto normalTimeStep = [&]()
+      {
+         gpuComm();
+         for( auto &block: *blocks )
+         {
+            if(!disableBoundaries)
+            {
+               auto p = boundaryStreams.parallelSection( defaultStream );
+               p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } );
+               p.run( [&block, &noSlip]( cudaStream_t s ) { noSlip( &block, s ); } );
+            }
+            lbKernel( &block );
+         }
+      };
+
+      auto kernelOnlyFunc = [&] ()
+      {
+          for( auto &block: *blocks )
+              lbKernel( &block );
+      };
+
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+
+      std::function<void()> timeStep;
+      if (timeStepStrategy == "noOverlap")
+          timeStep = std::function<void()>( normalTimeStep );
+      else if (timeStepStrategy == "complexOverlap")
+          timeStep = std::function<void()>( overlapTimeStep );
+      else if (timeStepStrategy == "simpleOverlap")
+          timeStep = simpleOverlapTimeStep;
+      else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") {
+          WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
+          timeStep = kernelOnlyFunc;
+      }
+      else {
+          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
+      }
+
+      timeLoop.add() << BeforeFunction( timeStep  )
+                     << Sweep( []( IBlock * ) {}, "time step" );
+
+      pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
+
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
+      if( vtkWriteFrequency > 0 )
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                          "simulation_step", false, true, true, false, 0 );
+         auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
+         vtkOutput->addCellDataWriter(velWriter);
+         vtkOutput->addBeforeFunction( [&]() {
+             cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
+             for( auto & block : *blocks )
+                 getterSweep( &block );
+         });
+         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+
+
+
+      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
+      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
+      for(int i=0; i < warmupSteps; ++i )
+         timeLoop.singleStep();
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+      if (remainingTimeLoggerFrequency > 0) {
+          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency );
+          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+      }
+
+       for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+       {
+           timeLoop.setCurrentTimeStepToZero();
+           WcTimer simTimer;
+           cudaDeviceSynchronize();
+           WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+           simTimer.start();
+           timeLoop.run();
+           cudaDeviceSynchronize();
+           simTimer.end();
+           WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+           auto time = simTimer.last();
+           auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+           auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+           WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+           WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+           WALBERLA_ROOT_SECTION()
+           {
+               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+               if ( pythonCallbackResults.isCallable())
+               {
+                   const char * storagePattern = "twofield";
+                   pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                   pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                   pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                   pythonCallbackResults.data().exposeValue( "storagePattern", storagePattern );
+                   pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
+                   pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
+                   // Call Python function to report results
+                   pythonCallbackResults();
+               }
+           }
+       }
+
+   }
+
+   return 0;
+}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm
similarity index 100%
rename from apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.prm
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py
new file mode 100644
index 0000000000000000000000000000000000000000..c861cb15578a7af152e382c0b7d4a607c52181f5
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU.py
@@ -0,0 +1,175 @@
+import sympy as sp
+import numpy as np
+import pystencils as ps
+from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
+from pystencils_walberla import generate_pack_info_from_kernel
+from lbmpy_walberla import generate_lattice_model, generate_boundary
+from pystencils_walberla import CodeGeneration, generate_sweep
+from pystencils.data_types import TypedSymbol
+from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
+from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
+
+omega = sp.symbols("omega")
+omega_free = sp.Symbol("omega_free")
+omega_fill = sp.symbols("omega_:10")
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
+
+sweep_params = {'block_size': sweep_block_size}
+
+options_dict = {
+    'srt': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+        'compressible': False,
+    },
+    'trt': {
+        'method': 'trt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+    },
+    'mrt': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
+    },
+    'mrt_full': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
+                             omega_fill[3], omega_fill[4], omega_fill[5]],
+    },
+    'entropic': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free, omega_free],
+        'entropic': True,
+    },
+    'entropic_kbc_n4': {
+        'method': 'trt-kbc-n4',
+        'stencil': 'D3Q27',
+        'compressible': True,
+        'relaxation_rates': [omega, omega_free],
+        'entropic': True,
+    },
+    'smagorinsky': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'smagorinsky': True,
+        'relaxation_rate': omega,
+    },
+    'cumulant': {
+        'method': 'cumulant',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rate': omega,
+    },
+}
+
+info_header = """
+#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
+const char * infoStencil = "{stencil}";
+const char * infoConfigName = "{configName}";
+const bool infoCseGlobal = {cse_global};
+const bool infoCsePdfs = {cse_pdfs};
+"""
+
+with CodeGeneration() as ctx:
+    accessor = StreamPullTwoFieldsAccessor()
+    # accessor = StreamPushTwoFieldsAccessor()
+    assert not accessor.is_inplace, "This app does not work for inplace accessors"
+
+    common_options = {
+        'field_name': 'pdfs',
+        'temporary_field_name': 'pdfs_tmp',
+        'kernel_type': accessor,
+        'optimization': {'cse_global': True,
+                         'cse_pdfs': False}
+    }
+    config_name = ctx.config
+    noopt = False
+    d3q27 = False
+    if config_name.endswith("_noopt"):
+        noopt = True
+        config_name = config_name[:-len("_noopt")]
+    if config_name.endswith("_d3q27"):
+        d3q27 = True
+        config_name = config_name[:-len("_d3q27")]
+
+    options = options_dict[config_name]
+    options.update(common_options)
+    options = options.copy()
+
+    if noopt:
+        options['optimization']['cse_global'] = False
+        options['optimization']['cse_pdfs'] = False
+    if d3q27:
+        options['stencil'] = 'D3Q27'
+
+    stencil_str = options['stencil']
+    q = int(stencil_str[stencil_str.find('Q') + 1:])
+    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
+    options['optimization']['symbolic_field'] = pdfs
+
+    vp = [
+        ('double', 'omega_0'),
+        ('double', 'omega_1'),
+        ('double', 'omega_2'),
+        ('double', 'omega_3'),
+        ('double', 'omega_4'),
+        ('double', 'omega_5'),
+        ('double', 'omega_6'),
+        ('int32_t', 'cudaBlockSize0'),
+        ('int32_t', 'cudaBlockSize1'),
+    ]
+    lb_method = create_lb_method(**options)
+    update_rule = create_lb_update_rule(lb_method=lb_method, **options)
+
+    if not noopt:
+        update_rule = insert_fast_divisions(update_rule)
+        update_rule = insert_fast_sqrts(update_rule)
+
+    # CPU lattice model - required for macroscopic value computation, VTK output etc.
+    options_without_opt = options.copy()
+    del options_without_opt['optimization']
+    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method,
+                                                                                        **options_without_opt))
+
+    # gpu LB sweep & boundaries
+    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
+                   field_swaps=[('pdfs', 'pdfs_tmp')],
+                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
+                   varying_parameters=vp)
+
+    generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
+    generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
+
+    # getter & setter
+    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=1.0)
+    getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs.center_vector, density=None)
+    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
+    generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
+
+    # communication
+    generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
+
+    infoHeaderParams = {
+        'stencil': stencil_str,
+        'q': q,
+        'configName': ctx.config,
+        'cse_global': int(options['optimization']['cse_global']),
+        'cse_pdfs': int(options['optimization']['cse_pdfs']),
+    }
+    ctx.write_file("UniformGridGPU_Defines.h", info_header.format(**infoHeaderParams))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h
similarity index 100%
rename from apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
rename to apps/benchmarks/UniformGridGPU/old_ideas/UniformGridGPU_Communication.h
diff --git a/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
new file mode 100755
index 0000000000000000000000000000000000000000..d13e155f7dd5903465d7a9274bf7ae8200148582
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/benchmark_configs.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
+Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
+ ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py
+
+Look at the end of the file to select the benchmark to run
+"""
+
+import os
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
+from copy import deepcopy
+from functools import reduce
+import operator
+import sys
+import sqlite3
+
+# Number of time steps run for a workload of 128^3 per GPU
+# if double as many cells are on the GPU, half as many time steps are run etc.
+# increase this to get more reliable measurements
+TIME_STEPS_FOR_128_BLOCK = 200
+DB_FILE = "gpu_benchmark.sqlite3"
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 128, 128),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 3,
+    }
+}
+
+
+def prod(seq):
+    return reduce(operator.mul, seq, 1)
+
+
+def num_time_steps(block_size, time_steps_for_128_block=200):
+    cells = block_size[0] * block_size[1] * block_size[2]
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
+    return int(time_steps)
+
+
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Checks if a given CUDA block size does not exceed the SM register limit.
+    168 registers per thread was obtained using cuobjdump on both SRT and Cumulant
+    kernels. You might want to validate that for your own kernels."""
+
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
+
+
+class Scenario:
+    def __init__(self, cells_per_block=(256, 128, 128), db_file=DB_FILE, **kwargs):
+        self.db_file = db_file
+        self.config_dict = deepcopy(BASE_CONFIG)
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+
+    @wlb.member_callback
+    def config(self):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        # Write out the configuration as text-based prm:
+        # print(toPrm(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
+        sequenceValuesToScalars(data)
+
+        result = data
+        sequenceValuesToScalars(result)
+        num_tries = 4
+        # check multiple times e.g. may fail when multiple benchmark processes are running
+        for num_try in range(num_tries):
+            try:
+                checkAndUpdateSchema(result, "runs", self.db_file)
+                storeSingle(result, "runs", self.db_file)
+                break
+            except sqlite3.OperationalError as e:
+                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+
+
+# -------------------------------------- Functions trying different parameter sets -----------------------------------
+
+
+def overlap_benchmark():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running different communication overlap strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
+
+    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
+    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+        # no overlap
+        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
+                               innerOuterSplit=(1, 1, 1)))
+
+        # overlap
+        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
+            for inner_outer_split in inner_outer_splits:
+                scenario = Scenario(timeStepStrategy=overlap_strategy,
+                                    communicationScheme=comm_strategy,
+                                    innerOuterSplit=inner_outer_split,
+                                    timesteps=num_time_steps((256, 128, 128)))
+                scenarios.add(scenario)
+
+
+def communication_compare():
+    """Tests different communication strategies"""
+    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
+        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
+
+            sc = Scenario(cells_per_block=block_size,
+                          gpuBlockSize=(128, 1, 1),
+                          timeStepStrategy='noOverlap',
+                          communicationScheme=comm_strategy,
+                          timesteps=num_time_steps(block_size))
+            scenarios.add(sc)
+            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
+                # ensure that the inner part of the domain is still large enough
+                if 3 * inner_outer_split[0] > block_size[0]:
+                    continue
+                sc = Scenario(cells_per_block=block_size,
+                              gpuBlockSize=(128, 1, 1),
+                              timeStepStrategy='simpleOverlap',
+                              innerOuterSplit=inner_outer_split,
+                              communicationScheme=comm_strategy,
+                              timesteps=num_time_steps(block_size))
+                scenarios.add(sc)
+
+
+def single_gpu_benchmark():
+    """Benchmarks only the LBM compute kernel"""
+    wlb.log_info_on_root("Running single GPU benchmarks")
+    wlb.log_info_on_root("")
+
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+
+    kwargs = {}
+    if gpu_type is not None:
+        kwargs['gpu_type'] = gpu_type
+
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=num_time_steps(block_size),
+                                **kwargs)
+            scenarios.add(scenario)
+
+
+# -------------------------------------- Optional job script generation for PizDaint ---------------------------------
+
+
+job_script_header = """
+#!/bin/bash -l
+#SBATCH --job-name=scaling
+#SBATCH --time=0:30:00
+#SBATCH --nodes={nodes}
+#SBATCH -o out_scaling_{nodes}_%j.txt
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+
+cd {folder}
+
+source ~/env.sh
+
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1  # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1            # allow GPU sharing
+export MPICH_G2G_PIPELINE=256     # adapt maximum number of concurrent in-flight messages
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+
+
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+
+ulimit -c 0
+"""
+
+job_script_exe_part = """
+
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+ ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant'
+                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+
+
+def generate_jobscripts(exe_names=all_executables):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+
+        # run grid_order
+        import subprocess
+        decomposition = block_decomposition(node_count)
+        decomposition_str = ",".join(str(e) for e in decomposition)
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+
+        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
+        for exe in exe_names:
+            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
+                                                     config='../communication_compare.py')
+
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write(job_script)
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    # Select the benchmark you want to run
+    single_gpu_benchmark()
+    # benchmarks different CUDA block sizes and domain sizes and measures single
+    # GPU performance of compute kernel (no communication)
+    # communication_compare(): benchmarks different communication routines, with and without overlap
+    # overlap_benchmark(): benchmarks different communication overlap options
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
similarity index 89%
rename from apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
rename to apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
index ef257af8bc013da08ad5d04765e786c44a756a5e..1b22fcaaf2fe65fff1f32f162c959bd012995b97 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm
+++ b/apps/benchmarks/UniformGridGPU/old_ideas/simulation_setup/single_node.prm
@@ -8,6 +8,7 @@ DomainSetup
 
 Parameters
 {
+    initShearFlow False;
     cudaEnabledMPI False;           // set to true, if MPI was compiled with CUDA
     outerIterations 3;              // number of measurements to make
     timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap, the non-AA version also supports complexOverlap
@@ -35,3 +36,10 @@ Parameters
     //   GPUPackInfo_Baseline:       old implementation based on communication mechanism for CPUs
     //   GPUPackInfo_Streams:        same as above but with CUDA streams
 }
+
+Boundaries {
+    Border { direction T; walldistance -1; flag UBB; }
+    Border { direction B; walldistance -1; flag NoSlip; }
+    Border { direction W; walldistance -1; flag NoSlip; }
+    Border { direction E; walldistance -1; flag NoSlip; }
+}
diff --git a/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods
new file mode 100644
index 0000000000000000000000000000000000000000..99ce7d0a33ff710ea40be7735533adcbb06b91d1
Binary files /dev/null and b/apps/benchmarks/UniformGridGPU/roofline_model/roofline.ods differ
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index 5fbf36a9488874e87318ca402c747378f5a15f46..b3e7ac03396100609e509149f35ab7179712f945 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -2,7 +2,7 @@
 """
 This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
 Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1  then run e.g.
- ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py
+ ./UniformGridGPU_d3q27_aa_srt simulation_setup/benchmark_configs.py
 
 Look at the end of the file to select the benchmark to run
 """
@@ -11,14 +11,15 @@ import os
 import waLBerla as wlb
 from waLBerla.tools.config import block_decomposition
 from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
-from copy import deepcopy
+from functools import reduce
+import operator
 import sys
 import sqlite3
 
 # Number of time steps run for a workload of 128^3 per GPU
 # if double as many cells are on the GPU, half as many time steps are run etc.
 # increase this to get more reliable measurements
-TIME_STEPS_FOR_128_BLOCK = 200
+TIME_STEPS_FOR_128_BLOCK = 500
 DB_FILE = "gpu_benchmark.sqlite3"
 
 BASE_CONFIG = {
@@ -35,26 +36,82 @@ BASE_CONFIG = {
 }
 
 
-def num_time_steps(block_size):
+def prod(seq):
+    return reduce(operator.mul, seq, 1)
+
+
+def num_time_steps(block_size, time_steps_for_128_block=200):
     cells = block_size[0] * block_size[1] * block_size[2]
-    time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK
+    time_steps = (128 ** 3 / cells) * time_steps_for_128_block
     return int(time_steps)
 
 
+def cuda_block_size_ok(block_size, regs_per_threads=168):
+    """Checks if a given CUDA block size does not exceed the SM register limit.
+    168 registers per thread was obtained using cuobjdump on both SRT and Cumulant
+    kernels. You might want to validate that for your own kernels."""
+
+    return prod(block_size) * regs_per_threads < 64 * (2 ** 10)
+
+
+def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
+    """Checks if a single block of given size fits into GPU memory"""
+    return prod(b + 2 * gls for b in block_size) * q * size_per_value < total_mem
+
+
 class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
-        self.config_dict = deepcopy(BASE_CONFIG)
-        self.config_dict['Parameters'].update(kwargs)
-        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
-        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1),
+                 timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
+                 inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False,
+                 additional_info=None):
+
+        self.blocks = block_decomposition(wlb.mpi.numProcesses())
+
+        self.cells_per_block = cells_per_block
+        self.periodic = periodic
+
+        self.time_step_strategy = time_step_strategy
+        self.omega = omega
+        self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block)
+        self.cuda_enabled_mpi = cuda_enabled_mpi
+        self.inner_outer_split = inner_outer_split
+        self.init_shear_flow = init_shear_flow
+        self.warmup_steps = warmup_steps
+        self.outer_iterations = outer_iterations
+        self.cuda_blocks = cuda_blocks
+
+        self.vtk_write_frequency = 0
+
+        self.config_dict = self.config(print_dict=False)
+        self.additional_info = additional_info
 
     @wlb.member_callback
-    def config(self, **kwargs):
+    def config(self, print_dict=True):
         from pprint import pformat
-        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
-        # Write out the configuration as text-based prm:
-        # print(toPrm(self.config_dict))
-        return self.config_dict
+        config_dict = {
+            'DomainSetup': {
+                'blocks': self.blocks,
+                'cellsPerBlock': self.cells_per_block,
+                'periodic': self.periodic,
+            },
+            'Parameters': {
+                'omega': self.omega,
+                'cudaEnabledMPI': self.cuda_enabled_mpi,
+                'warmupSteps': self.warmup_steps,
+                'outerIterations': self.outer_iterations,
+                'timeStepStrategy': self.time_step_strategy,
+                'timesteps': self.timesteps,
+                'initShearFlow': self.init_shear_flow,
+                'gpuBlockSize': self.cuda_blocks,
+                'innerOuterSplit': self.inner_outer_split,
+                'vtkWriteFrequency': self.vtk_write_frequency
+            }
+        }
+        if print_dict:
+            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
+            if self.additional_info:
+                wlb.log_info_on_root("Additional Info:\n" + pformat(self.additional_info))
+        return config_dict
 
     @wlb.member_callback
     def results_callback(self, **kwargs):
@@ -62,6 +119,10 @@ class Scenario:
         data.update(self.config_dict['Parameters'])
         data.update(self.config_dict['DomainSetup'])
         data.update(kwargs)
+
+        if self.additional_info is not None:
+            data.update(self.additional_info)
+
         data['executable'] = sys.argv[0]
         data['compile_flags'] = wlb.build_info.compiler_flags
         data['walberla_version'] = wlb.build_info.version
@@ -78,7 +139,22 @@ class Scenario:
                 storeSingle(result, "runs", DB_FILE)
                 break
             except sqlite3.OperationalError as e:
-                wlb.log_warning("Sqlite DB writing failed: try {}/{}  {}".format(num_try + 1, num_tries, str(e)))
+                wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries}  {str(e)}")
+
+
+# -------------------------------------- Profiling -----------------------------------
+def profiling():
+    """Tests different communication overlapping strategies"""
+    wlb.log_info_on_root("Running 2 timesteps for profiling")
+    wlb.log_info_on_root("")
+
+    scenarios = wlb.ScenarioManager()
+    cells = (128, 128, 128)
+    cuda_enabled_mpi = False
+
+    scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly',
+                           inner_outer_split=(1, 1, 1), timesteps=2, cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1, warmup_steps=0))
 
 
 # -------------------------------------- Functions trying different parameter sets -----------------------------------
@@ -90,52 +166,21 @@ def overlap_benchmark():
     wlb.log_info_on_root("")
 
     scenarios = wlb.ScenarioManager()
+    cuda_enabled_mpi = False
     inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
                           (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                           (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
 
-    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
-    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
-        # no overlap
-        scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy,
-                               innerOuterSplit=(1, 1, 1)))
-
-        # overlap
-        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
-            for inner_outer_split in inner_outer_splits:
-                scenario = Scenario(timeStepStrategy=overlap_strategy,
-                                    communicationScheme=comm_strategy,
-                                    innerOuterSplit=inner_outer_split,
-                                    timesteps=num_time_steps((256, 128, 128)))
-                scenarios.add(scenario)
-
-
-def communication_compare():
-    """Tests different communication strategies"""
-    wlb.log_info_on_root("Running benchmarks to compare communication strategies")
-    wlb.log_info_on_root("")
+    # no overlap
+    scenarios.add(Scenario(time_step_strategy='noOverlap',
+                           inner_outer_split=(1, 1, 1),
+                           cuda_enabled_mpi=cuda_enabled_mpi))
 
-    scenarios = wlb.ScenarioManager()
-    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
-        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
-
-            sc = Scenario(cells_per_block=block_size,
-                          gpuBlockSize=(128, 1, 1),
-                          timeStepStrategy='noOverlap',
-                          communicationScheme=comm_strategy,
-                          timesteps=num_time_steps(block_size))
-            scenarios.add(sc)
-            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
-                # ensure that the inner part of the domain is still large enough
-                if 3 * inner_outer_split[0] > block_size[0]:
-                    continue
-                sc = Scenario(cells_per_block=block_size,
-                              gpuBlockSize=(128, 1, 1),
-                              timeStepStrategy='simpleOverlap',
-                              innerOuterSplit=inner_outer_split,
-                              communicationScheme=comm_strategy,
-                              timesteps=num_time_steps(block_size))
-                scenarios.add(sc)
+    for inner_outer_split in inner_outer_splits:
+        scenario = Scenario(time_step_strategy='simpleOverlap',
+                            inner_outer_split=inner_outer_split,
+                            cuda_enabled_mpi=cuda_enabled_mpi)
+        scenarios.add(scenario)
 
 
 def single_gpu_benchmark():
@@ -143,8 +188,16 @@ def single_gpu_benchmark():
     wlb.log_info_on_root("Running single GPU benchmarks")
     wlb.log_info_on_root("")
 
+    gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
+    gpu_mem = gpu_mem_gb * (2 ** 30)
+    gpu_type = os.environ.get('GPU_TYPE')
+
+    additional_info = {}
+    if gpu_type is not None:
+        additional_info['gpu_type'] = gpu_type
+
     scenarios = wlb.ScenarioManager()
-    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 320, 384, 448, 512)]
     cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
                    (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
                    (32, 4, 1), (64, 4, 1), (128, 4, 1),
@@ -152,10 +205,17 @@ def single_gpu_benchmark():
                    (32, 16, 1)]
     for block_size in block_sizes:
         for cuda_block_size in cuda_blocks:
+            if not cuda_block_size_ok(cuda_block_size):
+                wlb.log_info_on_root(f"Cuda block size {cuda_block_size} would exceed register limit. Skipping.")
+                continue
+            if not domain_block_size_ok(block_size, gpu_mem):
+                wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
+                continue
             scenario = Scenario(cells_per_block=block_size,
-                                gpuBlockSize=cuda_block_size,
-                                timeStepStrategy='kernelOnly',
-                                timesteps=num_time_steps(block_size))
+                                cuda_blocks=cuda_block_size,
+                                time_step_strategy='kernelOnly',
+                                timesteps=num_time_steps(block_size),
+                                additional_info=additional_info)
             scenarios.add(scenario)
 
 
@@ -207,7 +267,6 @@ do
 done
 """
 
-
 all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
                    'UniformGridBenchmarkGPU_smagorinsky_d3q27',
                    'UniformGridBenchmarkGPU_cumulant'
@@ -238,10 +297,9 @@ if __name__ == '__main__':
     print("Called without waLBerla - generating job scripts for PizDaint")
     generate_jobscripts()
 else:
-    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
     # Select the benchmark you want to run
-    single_gpu_benchmark()
-    # benchmarks different CUDA block sizes and domain sizes and measures single
-    # GPU performance of compute kernel (no communication)
-    # communication_compare(): benchmarks different communication routines, with and without overlap
-    # overlap_benchmark(): benchmarks different communication overlap options
+    single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU
+    # performance of compute kernel (no communication)
+    # overlap_benchmark()  # benchmarks different communication overlap options
+    # profiling()  # run only two timesteps on a smaller domain for profiling only
diff --git a/python/pystencils_walberla/utility.py b/python/pystencils_walberla/utility.py
index 0bdf6e9923fe4fb106ab2975109a7c31a7f0d633..cda1ddbec3a6f98a9ea75d96037866c5018875b8 100644
--- a/python/pystencils_walberla/utility.py
+++ b/python/pystencils_walberla/utility.py
@@ -11,7 +11,8 @@ def generate_info_header(ctx: CodeGenerationContext,
                          field_typedefs: dict = None,
                          additional_headers: set = None,
                          headers_to_ignore: set = None,
-                         additional_typedefs: dict = None):
+                         additional_typedefs: dict = None,
+                         additional_code: str = ""):
     """Generates an info header, consolidating required information about the generated code.
     The info header #includes all generated header files, and is thus the only header the
     application needs to #include. It can also contain aliases for waLBerla stencil types and
@@ -24,7 +25,8 @@ def generate_info_header(ctx: CodeGenerationContext,
         field_typedefs: dict mapping type names to pystencils `Field` instances
         additional_headers: additional header files to be included
         headers_to_ignore: headers which should not be included
-        additional_typedefs: dict mapping aliases to types.
+        additional_typedefs: dict mapping aliases to types
+        additional_code: additional code which gets appended on the file
     """
     stencil_typedefs = stencil_typedefs if stencil_typedefs is not None else dict()
     field_typedefs = field_typedefs if field_typedefs is not None else dict()
@@ -50,7 +52,7 @@ def generate_info_header(ctx: CodeGenerationContext,
     if path.splitext(filename)[1] not in HEADER_EXTENSIONS:
         filename += '.h'
 
-    ctx.write_file(filename, lines)
+    ctx.write_file(filename, lines + additional_code)
 
 
 #   ------------------------------------- INTERNAL -------------------------------------------------------------
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index b701f144f972b8eb496849d260968f21306542eb..46b22c43a0056afc407c98341384e8d22b5147f8 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -4,7 +4,7 @@
 #
 ###################################################################################################
 
-waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil
+waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition executiontree field stencil lbm
                      BUILD_ONLY_IF_FOUND CUDA )
 
 ###################################################################################################
\ No newline at end of file
diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c7d2c0fbd9f1d152793bcbe84d577dc708ed9
--- /dev/null
+++ b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h
@@ -0,0 +1,86 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CombinedInPlaceGpuPackInfo.h
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#define IS_EVEN(x) ((x & 1) ^ 1)
+
+#include "cuda/communication/GeneratedGPUPackInfo.h"
+
+#include "lbm/inplace_streaming/TimestepTracker.h"
+
+namespace walberla {
+namespace lbm {
+
+template< typename EvenPackInfo, typename OddPackInfo >
+class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo
+{
+ public:
+   template< typename... Args >
+   CombinedInPlaceGpuPackInfo(std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args)
+      : tracker_(tracker), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...)
+   {}
+
+   virtual ~CombinedInPlaceGpuPackInfo() = default;
+
+   void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override
+   {
+      if (IS_EVEN(tracker_->getCounter()))
+      {
+         evenPackInfo_.pack(dir, buffer, block, stream);
+      }
+      else
+      {
+         oddPackInfo_.pack(dir, buffer, block, stream);
+      }
+   }
+
+   void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block, cudaStream_t stream) override {
+      if (IS_EVEN(tracker_->getCounter()))
+      {
+         evenPackInfo_.unpack(dir, buffer, block, stream);
+      }
+      else
+      {
+         oddPackInfo_.unpack(dir, buffer, block, stream);
+      }
+   }
+
+   uint_t size(stencil::Direction dir, IBlock* block) override {
+      if (IS_EVEN(tracker_->getCounter()))
+      {
+         return evenPackInfo_.size(dir, block);
+      }
+      else
+      {
+         return oddPackInfo_.size(dir, block);
+      }
+   }
+
+ private:
+   const std::shared_ptr< lbm::TimestepTracker >& tracker_;
+   EvenPackInfo evenPackInfo_;
+   OddPackInfo oddPackInfo_;
+};
+
+} // namespace lbm
+} // namespace walberla
+
+
diff --git a/src/field/Field.h b/src/field/Field.h
index a47fe47e7314011fe0bae2df37ca090facabb321..a01fc76cddc32539c7e482e0159a249f1189c54f 100644
--- a/src/field/Field.h
+++ b/src/field/Field.h
@@ -253,10 +253,10 @@ namespace field {
 
       inline Layout layout() const { return layout_; }
 
-      cell_idx_t xStride() const { return xfact_; }
-      cell_idx_t yStride() const { return yfact_; }
-      cell_idx_t zStride() const { return zfact_; }
-      cell_idx_t fStride() const { return ffact_; }
+      int64_t xStride() const { return xfact_; }
+      int64_t yStride() const { return yfact_; }
+      int64_t zStride() const { return zfact_; }
+      int64_t fStride() const { return ffact_; }
 
       cell_idx_t xOff() const { return xOff_; }
       cell_idx_t yOff() const { return yOff_; }
@@ -369,10 +369,10 @@ namespace field {
       Layout layout_;        //!< Determines in which order the values are stored
 
       uint_t     allocSize_; //!< The overall size of the T* (padding included)
-      cell_idx_t ffact_;     //!< Access multiplication factor for the f-dimension.
-      cell_idx_t xfact_;     //!< Access multiplication factor for the x-dimension.
-      cell_idx_t yfact_;     //!< Access multiplication factor for the y-dimension.
-      cell_idx_t zfact_;     //!< Access multiplication factor for the z-dimension.
+      int64_t ffact_;     //!< Access multiplication factor for the f-dimension.
+      int64_t xfact_;     //!< Access multiplication factor for the x-dimension.
+      int64_t yfact_;     //!< Access multiplication factor for the y-dimension.
+      int64_t zfact_;     //!< Access multiplication factor for the z-dimension.
 
       shared_ptr<FieldAllocator<T> > allocator_; //!< Allocator for the field
 
diff --git a/src/field/Field.impl.h b/src/field/Field.impl.h
index 20ee14bb95a2df3b50975f42c7ce96fb047c1dcf..c5fec45c77194c58cf10f2e0ba633740d54d89eb 100644
--- a/src/field/Field.impl.h
+++ b/src/field/Field.impl.h
@@ -322,14 +322,14 @@ namespace field {
          const uint_t alignment = 64;
 #elif defined(__ARM_NEON)
          const uint_t alignment = 16;
+#elif defined(__BIGGEST_ALIGNMENT__)
+         const uint_t alignment = __BIGGEST_ALIGNMENT__;
 #elif defined(__AVX512F__)
          const uint_t alignment = 64;
 #elif defined(__AVX__)
          const uint_t alignment = 32;
 #elif defined(__SSE__) || defined(_MSC_VER)
          const uint_t alignment = 16;
-#elif defined(__BIGGEST_ALIGNMENT__)
-         const uint_t alignment = __BIGGEST_ALIGNMENT__;
 #else
          const uint_t alignment = 64;
 #endif
@@ -361,24 +361,24 @@ namespace field {
          fAllocSize_ = fSize_;
 
          WALBERLA_CHECK_LESS_EQUAL( fSize_ * xAllocSize_ * yAllocSize_ * zAllocSize_ + xSize_ + ySize_ * xAllocSize_ + zSize_ * xAllocSize_ * yAllocSize_,
-                                    std::numeric_limits< cell_idx_t >::max(),
-                                    "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+                                    std::numeric_limits< int64_t >::max(),
+                                    "The data type 'int64_t' is too small for your field size! Your field is too large." );
 
-         ffact_ = cell_idx_c(xAllocSize_ * yAllocSize_ * zAllocSize_);
-         zfact_ = cell_idx_c(xAllocSize_ * yAllocSize_);
-         yfact_ = cell_idx_c(xAllocSize_);
+         ffact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_) * int64_t(zAllocSize_);
+         zfact_ = int64_t(xAllocSize_) * int64_t(yAllocSize_);
+         yfact_ = int64_t(xAllocSize_);
          xfact_ = 1;
       } else {
          values_ = allocator_->allocate(zSize_, ySize_, xSize_, fSize_, yAllocSize_, xAllocSize_, fAllocSize_);
          zAllocSize_ = zSize_;
 
          WALBERLA_CHECK_LESS_EQUAL( fSize_ + xSize_ * fAllocSize_ + ySize_ * fAllocSize_ * xAllocSize_ + zSize_ * fAllocSize_ * xAllocSize_ * yAllocSize_,
-                                    std::numeric_limits< cell_idx_t >::max(),
-                                    "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+                                    std::numeric_limits< int64_t >::max(),
+                                    "The data type 'int64_t' is too small for your field size! Your field is too large." );
 
-         zfact_ = cell_idx_c(fAllocSize_ * xAllocSize_ * yAllocSize_);
-         yfact_ = cell_idx_c(fAllocSize_ * xAllocSize_);
-         xfact_ = cell_idx_c(fAllocSize_);
+         zfact_ = int64_t (fAllocSize_) * int64_t(xAllocSize_) * int64_t(yAllocSize_);
+         yfact_ = int64_t(fAllocSize_) * int64_t(xAllocSize_);
+         xfact_ = int64_t (fAllocSize_);
          ffact_ = 1;
       }
 
@@ -721,7 +721,7 @@ namespace field {
    {
       assertValidCoordinates( x, y, z, f );
 
-      const cell_idx_t index = f*ffact_+ x*xfact_+ y*yfact_+ z*zfact_;
+      const int64_t index = f*int64_t(ffact_) + int64_t(x)*int64_t(xfact_) + int64_t(y)*int64_t(yfact_) + int64_t(z)*int64_t(zfact_);
 
       WALBERLA_ASSERT_LESS( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(allocSize_) );
       WALBERLA_ASSERT_GREATER_EQUAL( int64_c(index) + int64_c(valuesWithOffset_ - values_), int64_c(0) );
@@ -1102,7 +1102,7 @@ namespace field {
       xSize_ = xs;
       ySize_ = ys;
       zSize_ = zs;
-      const auto offset = xOff_*xfact_+ yOff_*yfact_+ zOff_*zfact_;
+      const int64_t offset = int64_t(xOff_)*int64_t(xfact_) + int64_t(yOff_)*int64_t(yfact_) + int64_t(zOff_)*int64_t(zfact_);
       valuesWithOffset_ = values_ + offset;
    }
 
diff --git a/src/lbm/all.h b/src/lbm/all.h
index d4b0a4f1559eaa977cb17e43b41ef75f48323f7b..dc581a251d6f31ce10032c74ccbf2147a7f90780 100644
--- a/src/lbm/all.h
+++ b/src/lbm/all.h
@@ -33,6 +33,7 @@
 #include "field/all.h"
 #include "geometry/all.h"
 #include "gui/all.h"
+#include "inplace_streaming/all.h"
 #include "lattice_model/all.h"
 #include "refinement/all.h"
 #include "sweeps/all.h"
diff --git a/src/lbm/inplace_streaming/TimestepTracker.h b/src/lbm/inplace_streaming/TimestepTracker.h
index c0ca22635fea5d9a454f99e2d4fe08a519578eef..daba8db2380af5895f15c1f5688cf59f06b28f8f 100644
--- a/src/lbm/inplace_streaming/TimestepTracker.h
+++ b/src/lbm/inplace_streaming/TimestepTracker.h
@@ -29,10 +29,10 @@ namespace lbm
 class TimestepTracker
 {
  private:
-   uint8_t counter_;
+   uint8_t counter_{ 0 };
 
  public:
-   TimestepTracker() : counter_(0) {}
+   TimestepTracker() = default;
    TimestepTracker(uint8_t zeroth_timestep) : counter_(zeroth_timestep & 1) {}
 
    void advance() { counter_ = (counter_ + 1) & 1; }
@@ -43,6 +43,7 @@ class TimestepTracker
    }
 
    uint8_t getCounter() const { return counter_; }
+   uint8_t getCounterPlusOne() const { return (counter_ + 1) & 1; }
 
 }; // class TimestepTracker
 
diff --git a/src/lbm/inplace_streaming/all.h b/src/lbm/inplace_streaming/all.h
new file mode 100644
index 0000000000000000000000000000000000000000..b984e282387c2b39c07819ddeebc58c0bf9327fb
--- /dev/null
+++ b/src/lbm/inplace_streaming/all.h
@@ -0,0 +1,23 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file all.h
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "TimestepTracker.h"