diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
index 68aab5882901317263c04045b69ac0a6de403adc..48f029deedca7e127d170206c35b778920b5d6ee 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
 };
 
 auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
-                           real_t inflow_velocity) {
-   Cell globalCell;
-   CellInterval domain = SbF->getDomainCellBB();
-   real_t h_y          = domain.yMax() - domain.yMin();
-   real_t h_z          = domain.zMax() - domain.zMin();
-   SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
+                           real_t inflow_velocity, const bool constant_inflow = true) {
+   if (constant_inflow)
+   {
+      Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
+      return result;
+   }
+   else
+   {
+      Cell globalCell;
+      CellInterval domain = SbF->getDomainCellBB();
+      real_t h_y          = real_c(domain.ySize());
+      real_t h_z          = real_c(domain.zSize());
+      SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
 
-   real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5);
-   real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5);
+      real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
+      real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
 
-   real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) *
-              (h_z / 2 + z1);
+      real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
+                 (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
 
-   Vector3< real_t > result(u, 0.0, 0.0);
-   return result;
+      Vector3< real_t > result(u, 0.0, 0.0);
+      return result;
+   }
 };
 
 class AlternatingBeforeFunction
@@ -147,6 +155,7 @@ int main(int argc, char** argv)
       const real_t u_max           = parameters.getParameter< real_t >("u_max", real_t(0.05));
       const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
       const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
+      const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
 
       const double remainingTimeLoggerFrequency =
          parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
@@ -204,7 +213,7 @@ int main(int argc, char** argv)
       auto boundariesConfig = config->getOneBlock("Boundaries");
 
       std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
-         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max);
+         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
 
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
@@ -236,10 +245,9 @@ int main(int argc, char** argv)
       AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
 
       // add LBM sweep and communication to time loop
-      timeloop.add() << BeforeFunction(communication, "communication")
-                     << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
+      timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
       timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
-      timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary");
+      timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
       timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
                      << Sweep(lbSweep.getSweep(tracker), "LB update rule");
 
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
index 42b9f33d24cdbc2e060a313033e0b2458eded091..c565baa589f21dc6681bdd40826aef643b0afd2a 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
 
 import sympy as sp
 
-stencil = get_stencil("D3Q27")
-q = len(stencil)
-dim = len(stencil[0])
-streaming_pattern = 'esotwist'
-timesteps = get_timesteps(streaming_pattern)
-
-pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
-omega = sp.Symbol("omega")
-u_max = sp.Symbol("u_max")
-
-output = {
-    'density': density_field,
-    'velocity': velocity_field
-}
-
-opt = {'symbolic_field': pdfs,
-       'cse_global': False,
-       'cse_pdfs': False}
-
-method_params = {'method': 'cumulant',
-                 'stencil': stencil,
-                 'relaxation_rate': omega,
-                 'galilean_correction': True,
-                 'field_name': 'pdfs',
-                 'streaming_pattern': streaming_pattern,
-                 'output': output,
-                 'optimization': opt}
-
-collision_rule = create_lb_collision_rule(**method_params)
-lb_method = collision_rule.method
-
-# getter & setter
-setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                               pdfs=pdfs, density=1.0,
-                                               streaming_pattern=streaming_pattern,
-                                               previous_timestep=timesteps[0])
-
-# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
-
-stencil_typedefs = {'Stencil_T': stencil}
-field_typedefs = {'PdfField_T': pdfs,
-                  'VelocityField_T': velocity_field,
-                  'ScalarField_T': density_field}
-
 with CodeGeneration() as ctx:
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = get_stencil("D3Q27")
+    q = len(stencil)
+    dim = len(stencil[0])
+    streaming_pattern = 'esotwist'
+    timesteps = get_timesteps(streaming_pattern)
+
+    pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
+                                                 layout='fzyx')
+    omega = sp.Symbol("omega")
+    u_max = sp.Symbol("u_max")
+
+    output = {
+        'density': density_field,
+        'velocity': velocity_field
+    }
+
+    opt = {'symbolic_field': pdfs,
+           'cse_global': False,
+           'cse_pdfs': False,
+           'double_precision': True if ctx.double_accuracy else False}
+
+    method_params = {'method': 'cumulant',
+                     'stencil': stencil,
+                     'relaxation_rate': omega,
+                     'galilean_correction': True,
+                     'field_name': 'pdfs',
+                     'streaming_pattern': streaming_pattern,
+                     'output': output,
+                     'optimization': opt}
+
+    collision_rule = create_lb_collision_rule(**method_params)
+    lb_method = collision_rule.method
+
+    # getter & setter
+    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs, density=1.0,
+                                                   streaming_pattern=streaming_pattern,
+                                                   previous_timestep=timesteps[0])
+
+    # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
+
+    stencil_typedefs = {'Stencil_T': stencil}
+    field_typedefs = {'PdfField_T': pdfs,
+                      'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
+
     if ctx.cuda:
         target = 'gpu'
     else:
diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
index 6d046735fd3867cb83670e13090d43f0e52166d4..41d38d16218d97a633ccca62c951356b16c2f446 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
 
 class Scenario:
     def __init__(self):
-        self.timesteps = 101
-        self.vtkWriteFrequency = 2000
+        self.timesteps = 1001
+        self.vtkWriteFrequency = 100
 
         self.cells = (384, 128, 128)
         self.blocks = (1, 1, 1)
         self.periodic = (0, 0, 0)
 
+        self.constant_inflow = True
+
         self.diameter_sphere = min(self.cells) // 2
         self.u_max = 0.1
         self.reynolds_number = 1000000
@@ -38,7 +40,8 @@ class Scenario:
                 'omega': self.omega,
                 'u_max': self.u_max,
                 'reynolds_number': self.reynolds_number,
-                'diameter_sphere': self.diameter_sphere
+                'diameter_sphere': self.diameter_sphere,
+                'constant_inflow': self.constant_inflow
             },
             'Boundaries': {
                 'Border': [
@@ -54,7 +57,7 @@ class Scenario:
                      'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
                      'radius': self.diameter_sphere // 2,
                      'flag': 'NoSlip'}
-                ]
+                ],
             },
         }
 
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index b3ef93bb589a84f807ac00464fa84cd141c1eb0c..a3a430204b232de43d07819e1a1647d6e444b999 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
 
 
-foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
-    foreach(stencil d3q27) # choose from {d3q19 d3q27}
-        foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
+foreach(streaming_pattern pull push aa esotwist)
+    foreach(stencil d3q19 d3q27)
+        foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
             set(config ${stencil}_${streaming_pattern}_${collision_setup})
             waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                     FILE UniformGridGPU.py
@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
                     UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
                     UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
                     UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
+                    UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
                     UniformGridGPU_InfoHeader.h
                     )
 
@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
                     FILES UniformGridGPU.cpp
                     DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
             set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
+
+            # all configs are excluded from all except for pull d3q27.
+            if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
+                set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
+                set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
+            else()
+                set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+                set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+            endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
+
         endforeach ()
     endforeach()
 endforeach()
\ No newline at end of file
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index bdcffe3ad3a8c1992d7746f4e40fd34fc9325c4a..b12c534a4a1fbe05fb58fd4c4571263c5e6d6ff9 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -1,3 +1,25 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file UniformGridGPU.cpp
+//! \author Martin Bauer <martin.bauer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
+
 #include "blockforest/Initialization.h"
 
 #include "core/Environment.h"
@@ -7,9 +29,9 @@
 
 #include "cuda/AddGPUFieldToStorage.h"
 #include "cuda/DeviceSelectMPI.h"
+#include "cuda/FieldCopy.h"
 #include "cuda/ParallelStreams.h"
 #include "cuda/communication/UniformGPUScheme.h"
-#include "cuda/FieldCopy.h"
 #include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
 
 #include "field/AddToStorage.h"
@@ -27,14 +49,13 @@
 
 #include "timeloop/SweepTimeloop.h"
 
-#include "InitShearVelocity.h"
-
 #include <cmath>
 
+#include "InitShearVelocity.h"
 #include "UniformGridGPU_InfoHeader.h"
 using namespace walberla;
 
-using FlagField_T            = FlagField<uint8_t>;
+using FlagField_T = FlagField< uint8_t >;
 
 int main(int argc, char** argv)
 {
@@ -58,10 +79,10 @@ int main(int argc, char** argv)
       Vector3< uint_t > cellsPerBlock =
          config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
       // Reading parameters
-      auto parameters        = config->getOneBlock("Parameters");
-      const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.4));
-      const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
-      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
+      auto parameters          = config->getOneBlock("Parameters");
+      const real_t omega       = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const uint_t timesteps   = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
 
       // Creating fields
       BlockDataID pdfFieldCpuID =
@@ -69,7 +90,8 @@ int main(int argc, char** argv)
       BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
 
       // Initialize velocity on cpu
-      if( initShearFlow ){
+      if (initShearFlow)
+      {
          WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
          initShearVelocity(blocks, velFieldCpuID);
       }
@@ -91,9 +113,7 @@ int main(int argc, char** argv)
       for (uint_t i = 0; i < 3; ++i)
       {
          if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
-         {
-            WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
-         }
+         { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") }
       }
 
       Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
@@ -117,23 +137,26 @@ int main(int argc, char** argv)
       LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
       lbSweep.setOuterPriority(streamHighPriority);
 
+      pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1],
+                                                                   gpuBlockSize[2]);
+
       // Boundaries
-      const FlagUID fluidFlagUID( "Fluid" );
-      BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
-      auto boundariesConfig = config->getBlock( "Boundaries" );
-      bool boundaries = false;
-      if( boundariesConfig )
+      const FlagUID fluidFlagUID("Fluid");
+      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
+      auto boundariesConfig   = config->getBlock("Boundaries");
+      bool boundaries         = false;
+      if (boundariesConfig)
       {
          boundaries = true;
-         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
-         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+         geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
       }
 
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
-      noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
+      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
 
       lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
-      ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
+      ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
 
       // Initial setup is the post-collision state of an even time step
       auto tracker = make_shared< lbm::TimestepTracker >(0);
@@ -143,7 +166,8 @@ int main(int argc, char** argv)
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
       UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
-      auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
+      auto packInfo =
+         make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
       comm.addPackInfo(packInfo);
 
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -152,17 +176,17 @@ int main(int argc, char** argv)
 
       auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
 
-      auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
          noSlip.run(block, t, stream);
          ubb.run(block, t, stream);
       };
 
-      auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
          noSlip.inner(block, t, stream);
          ubb.inner(block, t, stream);
       };
 
-      auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
          noSlip.outer(block, t, stream);
          ubb.outer(block, t, stream);
       };
@@ -170,13 +194,15 @@ int main(int argc, char** argv)
       auto simpleOverlapTimeStep = [&]() {
          // Communicate post-collision values of previous timestep...
          comm.startCommunication(defaultStream);
-         for (auto& block : *blocks){
-            if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
+         for (auto& block : *blocks)
+         {
+            if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
             lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
          }
          comm.wait(defaultStream);
-         for (auto& block : *blocks){
-            if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
+         for (auto& block : *blocks)
+         {
+            if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
             lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
          }
 
@@ -185,8 +211,9 @@ int main(int argc, char** argv)
 
       auto normalTimeStep = [&]() {
          comm.communicate(defaultStream);
-         for (auto& block : *blocks){
-            if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
+         for (auto& block : *blocks)
+         {
+            if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
             lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
          }
 
@@ -201,6 +228,12 @@ int main(int argc, char** argv)
             lbSweep(&block, tracker->getCounter(), defaultStream);
       };
 
+      // Stream only function to test a streaming pattern without executing lbm operations inside
+      auto StreamOnlyFunc = [&]() {
+         for (auto& block : *blocks)
+            StreamOnlyKernel(&block, defaultStream);
+      };
+
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       ///                                             TIME LOOP SETUP                                                ///
       //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -221,6 +254,13 @@ int main(int argc, char** argv)
          comm.communicate();
          timeStep = kernelOnlyFunc;
       }
+      else if (timeStepStrategy == "StreamOnly")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT(
+            "Running only streaming kernel without LBM - this makes only sense for benchmarking!")
+         // Run initial communication once to provide any missing stream-in populations
+         timeStep = StreamOnlyFunc;
+      }
       else
       {
          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
@@ -239,7 +279,7 @@ int main(int argc, char** argv)
          vtkOutput->addCellDataWriter(velWriter);
 
          vtkOutput->addBeforeFunction([&]() {
-           cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
+            cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
          });
          timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
       }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 7d3c8e7f21e602128564f4ea8e22a119e7080989..b615be83289620e218ce160d57f7588ba0886647 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 from lbmpy.stencils import get_stencil
+from lbmpy.updatekernels import create_stream_only_kernel
+from lbmpy.fieldaccess import *
 
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
@@ -44,7 +46,7 @@ options_dict = {
     },
     'mrt-overrelax': {
         'method': 'mrt',
-        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
     },
     'cumulant': {
         'method': 'cumulant',
@@ -59,7 +61,7 @@ options_dict = {
     'entropic': {
         'method': 'mrt',
         'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
+        'relaxation_rates': [omega, omega] + [omega_free] * 6,
         'entropic': True,
     },
     'smagorinsky': {
@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs};
 optimize = True
 
 with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
     config_tokens = ctx.config.split('_')
 
     assert len(config_tokens) >= 3
@@ -99,7 +102,8 @@ with CodeGeneration() as ctx:
     q = len(stencil)
     dim = len(stencil[0])
     assert dim == 3, "This app supports only three-dimensional stencils"
-    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
+    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]",
+                                               layout='fzyx')
 
     common_options = {
         'stencil': stencil,
@@ -110,7 +114,7 @@ with CodeGeneration() as ctx:
             'cse_pdfs': False,
             'symbolic_field': pdfs,
             'field_layout': 'fzyx',
-            'gpu_indexing_params': gpu_indexing_params,
+            'gpu_indexing_params': gpu_indexing_params
         }
     }
 
@@ -128,6 +132,14 @@ with CodeGeneration() as ctx:
         ('int32_t', 'cudaBlockSize2')
     ]
 
+    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
+    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
+    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
+    accessor = CollideOnlyInplaceAccessor()
+    accessor.is_inplace = False
+    field_swaps_stream_only = [(pdfs, pdfs_tmp)]
+    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
+
     # LB Sweep
     collision_rule = create_lb_collision_rule(**options)
 
@@ -148,6 +160,10 @@ with CodeGeneration() as ctx:
                                                    previous_timestep=Timestep.EVEN)
     generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
 
+    # Stream only kernel
+    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
+                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target='gpu')
+
     # Boundaries
     noslip = NoSlip()
     ubb = UBB((0.05, 0, 0))
diff --git a/python/lbmpy_walberla/walberla_lbm_generation.py b/python/lbmpy_walberla/walberla_lbm_generation.py
index 936fcae85ccaff437f97cd28c8e4bf5f74c30cb1..21463f79ca7df3e1fc56c63f6025da7291fe1cd3 100644
--- a/python/lbmpy_walberla/walberla_lbm_generation.py
+++ b/python/lbmpy_walberla/walberla_lbm_generation.py
@@ -9,7 +9,7 @@ import pystencils as ps
 from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor
 from lbmpy.relaxationrates import relaxation_rate_scaling
 from lbmpy.stencils import get_stencil
-from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel
+from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
 from pystencils import AssignmentCollection, create_kernel
 from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers
@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
     if not stencil_name:
         raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
 
-
     communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27"
     is_float = not generation_context.double_accuracy
     dtype_string = "float32" if is_float else "float64"
@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
     collide_ast.function_name = 'kernel_collide'
     collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
 
-    stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', field_layout,
-                                                        dtype)
+    stream_update_rule = create_stream_only_kernel(lb_method.stencil, src_field, dst_field,
+                                                   accessor=StreamPullTwoFieldsAccessor())
     stream_ast = create_kernel(stream_update_rule, **create_kernel_params)
     stream_ast.function_name = 'kernel_stream'
     stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']