Enable single precision for benchmark cases

ce49cdbf · Markus Holzer · Helen Schottenhamml · b4b0cb4b · ce49cdbf · ce49cdbf
Commit ce49cdbf authored 3 years ago by Markus Holzer Committed by Helen Schottenhamml 3 years ago
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
 };
 auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
-                           real_t inflow_velocity) {
+                           real_t inflow_velocity, const bool constant_inflow = true) {
-   Cell globalCell;
+   if (constant_inflow)
-   CellInterval domain = SbF->getDomainCellBB();
+   {
-   real_t h_y          = domain.yMax() - domain.yMin();
+      Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
-   real_t h_z          = domain.zMax() - domain.zMin();
+      return result;
-   SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
+   }
+   else
+   {
+      Cell globalCell;
+      CellInterval domain = SbF->getDomainCellBB();
+      real_t h_y          = real_c(domain.ySize());
+      real_t h_z          = real_c(domain.zSize());
+      SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
-   real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5);
+      real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
-   real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5);
+      real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
-   real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) *
+      real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
-              (h_z / 2 + z1);
+                 (h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
-   Vector3< real_t > result(u, 0.0, 0.0);
+      Vector3< real_t > result(u, 0.0, 0.0);
-   return result;
+      return result;
+   }
 };
 class AlternatingBeforeFunction
@@ -147,6 +155,7 @@ int main(int argc, char** argv)
      const real_t u_max           = parameters.getParameter< real_t >("u_max", real_t(0.05));
      const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
      const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
+      const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
      const double remainingTimeLoggerFrequency =
         parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
@@ -204,7 +213,7 @@ int main(int argc, char** argv)
      auto boundariesConfig = config->getOneBlock("Boundaries");
      std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
-         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max);
+         velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
      lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
@@ -236,10 +245,9 @@ int main(int argc, char** argv)
      AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
      // add LBM sweep and communication to time loop
-      timeloop.add() << BeforeFunction(communication, "communication")
+      timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
-                     << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
      timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
-      timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary");
+      timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
      timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
                     << Sweep(lbSweep.getSweep(tracker), "LB update rule");

--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.py
@@ -14,51 +14,54 @@ from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_
 import sympy as sp
-stencil = get_stencil("D3Q27")
-q = len(stencil)
-dim = len(stencil[0])
-streaming_pattern = 'esotwist'
-timesteps = get_timesteps(streaming_pattern)
-pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
-omega = sp.Symbol("omega")
-u_max = sp.Symbol("u_max")
-output = {
-    'density': density_field,
-    'velocity': velocity_field
-}
-opt = {'symbolic_field': pdfs,
-       'cse_global': False,
-       'cse_pdfs': False}
-method_params = {'method': 'cumulant',
-                 'stencil': stencil,
-                 'relaxation_rate': omega,
-                 'galilean_correction': True,
-                 'field_name': 'pdfs',
-                 'streaming_pattern': streaming_pattern,
-                 'output': output,
-                 'optimization': opt}
-collision_rule = create_lb_collision_rule(**method_params)
-lb_method = collision_rule.method
-# getter & setter
-setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
-                                               pdfs=pdfs, density=1.0,
-                                               streaming_pattern=streaming_pattern,
-                                               previous_timestep=timesteps[0])
-# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
-stencil_typedefs = {'Stencil_T': stencil}
-field_typedefs = {'PdfField_T': pdfs,
-                  'VelocityField_T': velocity_field,
-                  'ScalarField_T': density_field}
 with CodeGeneration() as ctx:
+    data_type = "float64" if ctx.double_accuracy else "float32"
+    stencil = get_stencil("D3Q27")
+    q = len(stencil)
+    dim = len(stencil[0])
+    streaming_pattern = 'esotwist'
+    timesteps = get_timesteps(streaming_pattern)
+    pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
+                                                 layout='fzyx')
+    omega = sp.Symbol("omega")
+    u_max = sp.Symbol("u_max")
+    output = {
+        'density': density_field,
+        'velocity': velocity_field
+    }
+    opt = {'symbolic_field': pdfs,
+           'cse_global': False,
+           'cse_pdfs': False,
+           'double_precision': True if ctx.double_accuracy else False}
+    method_params = {'method': 'cumulant',
+                     'stencil': stencil,
+                     'relaxation_rate': omega,
+                     'galilean_correction': True,
+                     'field_name': 'pdfs',
+                     'streaming_pattern': streaming_pattern,
+                     'output': output,
+                     'optimization': opt}
+    collision_rule = create_lb_collision_rule(**method_params)
+    lb_method = collision_rule.method
+    # getter & setter
+    setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
+                                                   pdfs=pdfs, density=1.0,
+                                                   streaming_pattern=streaming_pattern,
+                                                   previous_timestep=timesteps[0])
+    # opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
+    stencil_typedefs = {'Stencil_T': stencil}
+    field_typedefs = {'PdfField_T': pdfs,
+                      'VelocityField_T': velocity_field,
+                      'ScalarField_T': density_field}
    if ctx.cuda:
        target = 'gpu'
    else:

--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGenParameters.py
@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
 class Scenario:
    def __init__(self):
-        self.timesteps = 101
+        self.timesteps = 1001
-        self.vtkWriteFrequency = 2000
+        self.vtkWriteFrequency = 100
        self.cells = (384, 128, 128)
        self.blocks = (1, 1, 1)
        self.periodic = (0, 0, 0)
+        self.constant_inflow = True
        self.diameter_sphere = min(self.cells) // 2
        self.u_max = 0.1
        self.reynolds_number = 1000000
@@ -38,7 +40,8 @@ class Scenario:
                'omega': self.omega,
                'u_max': self.u_max,
                'reynolds_number': self.reynolds_number,
-                'diameter_sphere': self.diameter_sphere
+                'diameter_sphere': self.diameter_sphere,
+                'constant_inflow': self.constant_inflow
            },
            'Boundaries': {
                'Border': [
@@ -54,7 +57,7 @@ class Scenario:
                     'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
                     'radius': self.diameter_sphere // 2,
                     'flag': 'NoSlip'}
-                ]
+                ],
            },
        }

--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -4,9 +4,9 @@ waLBerla_link_files_to_builddir( "*.py" )
 waLBerla_link_files_to_builddir( "simulation_setup" )
-foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
+foreach(streaming_pattern pull push aa esotwist)
-    foreach(stencil d3q27) # choose from {d3q19 d3q27}
+    foreach(stencil d3q19 d3q27)
-        foreach (collision_setup srt trt mrt cumulant) # choose from {srt trt mrt cumulant entropic smagorinsky}
+        foreach (collision_setup srt trt mrt cumulant entropic smagorinsky mrt-overrelax cumulant-overrelax)
            set(config ${stencil}_${streaming_pattern}_${collision_setup})
            waLBerla_generate_target_from_python(NAME UniformGridGPUGenerated_${config}
                    FILE UniformGridGPU.py
@@ -17,6 +17,7 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
                    UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
                    UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
                    UniformGridGPU_MacroSetter.cu UniformGridGPU_MacroSetter.h
+                    UniformGridGPU_StreamOnlyKernel.cu UniformGridGPU_StreamOnlyKernel.h
                    UniformGridGPU_InfoHeader.h
                    )
@@ -25,6 +26,16 @@ foreach(streaming_pattern aa) # choose from {pull, push, aa, esotwist}
                    FILES UniformGridGPU.cpp
                    DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk UniformGridGPUGenerated_${config})
            set_target_properties( UniformGridGPU_${config} PROPERTIES CXX_VISIBILITY_PRESET hidden)
+            # all configs are excluded from all except for pull d3q27.
+            if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
+                set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
+                set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE)
+            else()
+                set_target_properties( UniformGridGPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+                set_target_properties( UniformGridGPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+            endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
        endforeach ()
    endforeach()
 endforeach()
\ No newline at end of file
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file UniformGridGPU.cpp
+//! \author Martin Bauer <martin.bauer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//! \author Markus Holzer <markus.holzer@fau.de>
+//
+//======================================================================================================================
 #include "blockforest/Initialization.h"
 #include "core/Environment.h"
@@ -7,9 +29,9 @@
 #include "cuda/AddGPUFieldToStorage.h"
 #include "cuda/DeviceSelectMPI.h"
+#include "cuda/FieldCopy.h"
 #include "cuda/ParallelStreams.h"
 #include "cuda/communication/UniformGPUScheme.h"
-#include "cuda/FieldCopy.h"
 #include "cuda/lbm/CombinedInPlaceGpuPackInfo.h"
 #include "field/AddToStorage.h"
@@ -27,14 +49,13 @@
 #include "timeloop/SweepTimeloop.h"
-#include "InitShearVelocity.h"
 #include <cmath>
+#include "InitShearVelocity.h"
 #include "UniformGridGPU_InfoHeader.h"
 using namespace walberla;
-using FlagField_T            = FlagField<uint8_t>;
+using FlagField_T = FlagField< uint8_t >;
 int main(int argc, char** argv)
 {
@@ -58,10 +79,10 @@ int main(int argc, char** argv)
      Vector3< uint_t > cellsPerBlock =
         config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
      // Reading parameters
-      auto parameters        = config->getOneBlock("Parameters");
+      auto parameters          = config->getOneBlock("Parameters");
-      const real_t omega     = parameters.getParameter< real_t >("omega", real_c(1.4));
+      const real_t omega       = parameters.getParameter< real_t >("omega", real_c(1.4));
-      const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
+      const uint_t timesteps   = parameters.getParameter< uint_t >("timesteps", uint_c(50));
-      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
+      const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true);
      // Creating fields
      BlockDataID pdfFieldCpuID =
@@ -69,7 +90,8 @@ int main(int argc, char** argv)
      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
      // Initialize velocity on cpu
-      if( initShearFlow ){
+      if (initShearFlow)
+      {
         WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow")
         initShearVelocity(blocks, velFieldCpuID);
      }
@@ -91,9 +113,7 @@ int main(int argc, char** argv)
      for (uint_t i = 0; i < 3; ++i)
      {
         if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2)
-         {
+         { WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") }
-            WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock")
-         }
      }
      Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
@@ -117,23 +137,26 @@ int main(int argc, char** argv)
      LbSweep lbSweep(pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2], innerOuterSplitCell);
      lbSweep.setOuterPriority(streamHighPriority);
+      pystencils::UniformGridGPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldGpuID, gpuBlockSize[0], gpuBlockSize[1],
+                                                                   gpuBlockSize[2]);
      // Boundaries
-      const FlagUID fluidFlagUID( "Fluid" );
+      const FlagUID fluidFlagUID("Fluid");
-      BlockDataID flagFieldID = field::addFlagFieldToStorage<FlagField_T>(blocks, "Boundary Flag Field");
+      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
-      auto boundariesConfig = config->getBlock( "Boundaries" );
+      auto boundariesConfig   = config->getBlock("Boundaries");
-      bool boundaries = false;
+      bool boundaries         = false;
-      if( boundariesConfig )
+      if (boundariesConfig)
      {
         boundaries = true;
-         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+         geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
-         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
      }
      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
-      noSlip.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
+      noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID);
      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
-      ubb.fillFromFlagField<FlagField_T>(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
+      ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID);
      // Initial setup is the post-collision state of an even time step
      auto tracker = make_shared< lbm::TimestepTracker >(0);
@@ -143,7 +166,8 @@ int main(int argc, char** argv)
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      UniformGPUScheme< Stencil_T > comm(blocks, cudaEnabledMPI);
-      auto packInfo = make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
+      auto packInfo =
+         make_shared< lbm::CombinedInPlaceGpuPackInfo< PackInfoEven, PackInfoOdd > >(tracker, pdfFieldGpuID);
      comm.addPackInfo(packInfo);
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -152,17 +176,17 @@ int main(int argc, char** argv)
      auto defaultStream = cuda::StreamRAII::newPriorityStream(streamLowPriority);
-      auto boundarySweep = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundarySweep = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
         noSlip.run(block, t, stream);
         ubb.run(block, t, stream);
      };
-      auto boundaryInner = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundaryInner = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
         noSlip.inner(block, t, stream);
         ubb.inner(block, t, stream);
      };
-      auto boundaryOuter = [&](IBlock * block, uint8_t t, cudaStream_t stream){
+      auto boundaryOuter = [&](IBlock* block, uint8_t t, cudaStream_t stream) {
         noSlip.outer(block, t, stream);
         ubb.outer(block, t, stream);
      };
@@ -170,13 +194,15 @@ int main(int argc, char** argv)
      auto simpleOverlapTimeStep = [&]() {
         // Communicate post-collision values of previous timestep...
         comm.startCommunication(defaultStream);
-         for (auto& block : *blocks){
+         for (auto& block : *blocks)
-            if(boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
+         {
+            if (boundaries) boundaryInner(&block, tracker->getCounter(), defaultStream);
            lbSweep.inner(&block, tracker->getCounterPlusOne(), defaultStream);
         }
         comm.wait(defaultStream);
-         for (auto& block : *blocks){
+         for (auto& block : *blocks)
-            if(boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
+         {
+            if (boundaries) boundaryOuter(&block, tracker->getCounter(), defaultStream);
            lbSweep.outer(&block, tracker->getCounterPlusOne(), defaultStream);
         }
@@ -185,8 +211,9 @@ int main(int argc, char** argv)
      auto normalTimeStep = [&]() {
         comm.communicate(defaultStream);
-         for (auto& block : *blocks){
+         for (auto& block : *blocks)
-            if(boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
+         {
+            if (boundaries) boundarySweep(&block, tracker->getCounter(), defaultStream);
            lbSweep(&block, tracker->getCounterPlusOne(), defaultStream);
         }
@@ -201,6 +228,12 @@ int main(int argc, char** argv)
            lbSweep(&block, tracker->getCounter(), defaultStream);
      };
+      // Stream only function to test a streaming pattern without executing lbm operations inside
+      auto StreamOnlyFunc = [&]() {
+         for (auto& block : *blocks)
+            StreamOnlyKernel(&block, defaultStream);
+      };
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
      ///                                             TIME LOOP SETUP                                                ///
      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -221,6 +254,13 @@ int main(int argc, char** argv)
         comm.communicate();
         timeStep = kernelOnlyFunc;
      }
+      else if (timeStepStrategy == "StreamOnly")
+      {
+         WALBERLA_LOG_INFO_ON_ROOT(
+            "Running only streaming kernel without LBM - this makes only sense for benchmarking!")
+         // Run initial communication once to provide any missing stream-in populations
+         timeStep = StreamOnlyFunc;
+      }
      else
      {
         WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', "
@@ -239,7 +279,7 @@ int main(int argc, char** argv)
         vtkOutput->addCellDataWriter(velWriter);
         vtkOutput->addBeforeFunction([&]() {
-           cuda::fieldCpy< VelocityField_T , cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
+            cuda::fieldCpy< VelocityField_T, cuda::GPUField< real_t > >(blocks, velFieldCpuID, velFieldGpuID);
         });
         timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
      }

--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -11,6 +11,8 @@ from lbmpy.boundaries import NoSlip, UBB
 from lbmpy.creationfunctions import create_lb_collision_rule
 from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
 from lbmpy.stencils import get_stencil
+from lbmpy.updatekernels import create_stream_only_kernel
+from lbmpy.fieldaccess import *
 from pystencils_walberla import CodeGeneration, generate_info_header, generate_sweep
 from lbmpy_walberla import generate_alternating_lbm_sweep, generate_lb_pack_info, generate_alternating_lbm_boundary
@@ -44,7 +46,7 @@ options_dict = {
    },
    'mrt-overrelax': {
        'method': 'mrt',
-        'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
+        'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)],
    },
    'cumulant': {
        'method': 'cumulant',
@@ -59,7 +61,7 @@ options_dict = {
    'entropic': {
        'method': 'mrt',
        'compressible': True,
-        'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free],
+        'relaxation_rates': [omega, omega] + [omega_free] * 6,
        'entropic': True,
    },
    'smagorinsky': {
@@ -81,6 +83,7 @@ const bool infoCsePdfs = {cse_pdfs};
 optimize = True
 with CodeGeneration() as ctx:
+    field_type = "float64" if ctx.double_accuracy else "float32"
    config_tokens = ctx.config.split('_')
    assert len(config_tokens) >= 3
@@ -99,7 +102,8 @@ with CodeGeneration() as ctx:
    q = len(stencil)
    dim = len(stencil[0])
    assert dim == 3, "This app supports only three-dimensional stencils"
-    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : double[3D]", layout='fzyx')
+    pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]",
+                                               layout='fzyx')
    common_options = {
        'stencil': stencil,
@@ -110,7 +114,7 @@ with CodeGeneration() as ctx:
            'cse_pdfs': False,
            'symbolic_field': pdfs,
            'field_layout': 'fzyx',
-            'gpu_indexing_params': gpu_indexing_params,
+            'gpu_indexing_params': gpu_indexing_params
        }
    }
@@ -128,6 +132,14 @@ with CodeGeneration() as ctx:
        ('int32_t', 'cudaBlockSize2')
    ]
+    # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM.
+    # is_inplace is set to False to ensure that the streaming is done with src and dst field.
+    # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense.
+    accessor = CollideOnlyInplaceAccessor()
+    accessor.is_inplace = False
+    field_swaps_stream_only = [(pdfs, pdfs_tmp)]
+    stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor)
    # LB Sweep
    collision_rule = create_lb_collision_rule(**options)
@@ -148,6 +160,10 @@ with CodeGeneration() as ctx:
                                                   previous_timestep=Timestep.EVEN)
    generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments, target='gpu')
+    # Stream only kernel
+    generate_sweep(ctx, 'UniformGridGPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
+                   gpu_indexing_params=gpu_indexing_params, varying_parameters=vp, target='gpu')
    # Boundaries
    noslip = NoSlip()
    ubb = UBB((0.05, 0, 0))

--- a/python/lbmpy_walberla/walberla_lbm_generation.py
+++ b/python/lbmpy_walberla/walberla_lbm_generation.py
@@ -9,7 +9,7 @@ import pystencils as ps
 from lbmpy.fieldaccess import CollideOnlyInplaceAccessor, StreamPullTwoFieldsAccessor
 from lbmpy.relaxationrates import relaxation_rate_scaling
 from lbmpy.stencils import get_stencil
-from lbmpy.updatekernels import create_lbm_kernel, create_stream_pull_only_kernel
+from lbmpy.updatekernels import create_lbm_kernel, create_stream_only_kernel
 from pystencils import AssignmentCollection, create_kernel
 from pystencils.astnodes import SympyAssignment
 from pystencils.backends.cbackend import CBackend, CustomSympyPrinter, get_headers
@@ -31,7 +31,6 @@ def __lattice_model(generation_context, class_name, lb_method, stream_collide_as
    if not stencil_name:
        raise ValueError("lb_method uses a stencil that is not supported in waLBerla")
    communication_stencil_name = stencil_name if stencil_name != "D3Q15" else "D3Q27"
    is_float = not generation_context.double_accuracy
    dtype_string = "float32" if is_float else "float64"
@@ -165,8 +164,8 @@ def generate_lattice_model(generation_context, class_name, collision_rule, field
    collide_ast.function_name = 'kernel_collide'
    collide_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']
-    stream_update_rule = create_stream_pull_only_kernel(lb_method.stencil, None, 'pdfs', 'pdfs_tmp', field_layout,
+    stream_update_rule = create_stream_only_kernel(lb_method.stencil, src_field, dst_field,
-                                                        dtype)
+                                                   accessor=StreamPullTwoFieldsAccessor())
    stream_ast = create_kernel(stream_update_rule, **create_kernel_params)
    stream_ast.function_name = 'kernel_stream'
    stream_ast.assumed_inner_stride_one = create_kernel_params['cpu_vectorize_info']['assume_inner_stride_one']