Commit 6a61fc77 authored by Martin Bauer

UniformGridGPU: New app with AA pattern (single PDF field)

parent 0b75cce0
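
In the AA pattern, stream and collide run in place on a single PDF field by alternating two kernels: an even step that reads and writes only cell-local values, and an odd step that pulls from and pushes to neighboring cells. A minimal sketch of the resulting time loop, mirroring the generated sweeps and communication schemes used in UniformGridGPU_AA.cpp below (names taken from that app; the loop itself is illustrative, not the committed code):

// Illustrative AA time loop: two LB steps per timeloop iteration, one PDF field.
for( uint_t t = 0; t < timesteps; t += 2 )
{
   pushScheme->communicate();                          // merge the previous odd step's ghost-layer writes
   for( auto & block : *blocks ) kernelEven( &block ); // cell-local read/write, no neighbor access
   pullScheme->communicate();                          // fill ghost layers for the odd step's neighbor reads
   for( auto & block : *blocks ) kernelOdd( &block );  // pulls from and pushes to neighbor cells
}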
@@ -8,11 +8,31 @@ waLBerla_python_file_generates(UniformGridGPU.py
UniformGridGPU_NoSlip.cu UniformGridGPU_NoSlip.h
UniformGridGPU_UBB.cu UniformGridGPU_UBB.h
UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
UniformGridGPU_MacroSetter.cpp UniformGridGPU_MacroSetter.h
UniformGridGPU_MacroGetter.cpp UniformGridGPU_MacroGetter.h
)
foreach(config srt trt mrt smagorinsky entropic )
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config}
FILES UniformGridGPU.cpp UniformGridGPU.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
endforeach()
waLBerla_python_file_generates(UniformGridGPU_AA.py
UniformGridGPU_AA_LbKernelEven.cu UniformGridGPU_AA_LbKernelEven.h
UniformGridGPU_AA_LbKernelOdd.cu UniformGridGPU_AA_LbKernelOdd.h
UniformGridGPU_AA_PackInfoPull.cu UniformGridGPU_AA_PackInfoPull.h
UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
)
set(config "srt")
waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
CODEGEN_CFG ${config})
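# Note: CODEGEN_CFG is passed through to the generator script as ctx.config and
# selects the matching entry in its options_dict (see UniformGridGPU_AA.py below).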
#include "core/math/Random.h"
#include "domain_decomposition/StructuredBlockStorage.h"
namespace walberla {
inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks)
{
auto velField = block.getData<GhostLayerField<real_t, 3> >( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
velField->get(x, y, z, 0) = xMagnitude;
} else {
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
}
}
\ No newline at end of file
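
For reference, a minimal call site for the new helper, using the field setup from the apps below and hypothetical non-default magnitudes:

// Hypothetical usage of initShearVelocity (defaults: xMagnitude=0.1, fluctuationMagnitude=0.05)
BlockDataID velFieldID = field::addToStorage< GhostLayerField<real_t, 3> >( blocks, "vel", real_t(0), field::fzyx );
initShearVelocity( blocks, velFieldID, real_t(0.08), real_t(0.02) );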
@@ -5,12 +5,10 @@
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "lbm/field/PdfField.h"
#include "lbm/field/AddToStorage.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/vtk/VTKOutput.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
@@ -25,8 +23,9 @@
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "lbm/sweeps/CellwiseSweep.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "lbm/gui/Connection.h"
#include "UniformGridGPU_LatticeModel.h"
#include "UniformGridGPU_LbKernel.h"
@@ -34,37 +33,45 @@
#include "UniformGridGPU_UBB.h"
#include "UniformGridGPU_NoSlip.h"
#include "UniformGridGPU_Communication.h"
#include "UniformGridGPU_MacroSetter.h"
#include "UniformGridGPU_MacroGetter.h"
using namespace walberla;
using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
const auto Q = LatticeModel_T::Stencil::Q;
using Stencil_T = LatticeModel_T::Stencil;
using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
using PdfField_T = lbm::PdfField<LatticeModel_T>;
using PdfField_T = GhostLayerField<real_t, Q>;
using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
using VelocityField_T = GhostLayerField<real_t, 3>;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField<flag_t>;
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID,
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
{
math::seedRandomGenerator(0);
auto halfZ = blocks->getDomainCellBB().zMax() / 2;
for( auto & block: *blocks)
{
PdfField_T * pdfField = block.getData<PdfField_T>( pdfFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
auto velField = block.getData<VelocityField_T>( velFieldID );
WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(velField,
Cell globalCell;
blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
velField->get(x, y, z, 1) = real_t(0);
velField->get(x, y, z, 2) = randomReal;
if( globalCell[2] >= halfZ ) {
pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
velField->get(x, y, z, 0) = xMagnitude;
} else {
pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0,randomReal), real_t(1.0));
velField->get(x, y, z, 0) = -xMagnitude;
}
);
}
@@ -89,16 +96,24 @@ int main( int argc, char **argv )
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() );
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
// Creating fields
auto latticeModel = LatticeModel_T( omega );
BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx );
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx);
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx);
if( initShearFlow ) {
WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow");
initShearVelocity( blocks, pdfFieldCpuID );
initShearVelocity( blocks, velFieldCpuID );
}
pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID);
for( auto & block : *blocks )
setterSweep( &block );
// setter sweep initializes only the interior of the domain - for push schemes to work, an initial communication is required here to fill the ghost layers
blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks);
initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) );
initialComm();
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
@@ -253,17 +268,21 @@ int main( int argc, char **argv )
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
pystencils::UniformGridGPU_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
if( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
vtkOutput->addCellDataWriter(
make_shared<lbm::VelocityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Velocity" ));
vtkOutput->addCellDataWriter( make_shared<lbm::DensityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Density" ));
vtkOutput->addBeforeFunction(
cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ));
auto velWriter = make_shared< field::VTKWriter<VelocityField_T> >(velFieldCpuID, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction( [&]() {
cuda::fieldCpy<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for( auto & block : *blocks )
getterSweep( &block );
});
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
@@ -280,31 +299,41 @@ int main( int argc, char **argv )
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for(int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
bool useGui = parameters.getParameter<bool>( "useGui", false );
if( useGui )
{
GUI gui( timeLoop, blocks, argc, argv);
lbm::connectToGui<LatticeModel_T>(gui);
gui.run();
}
else
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
......
DomainSetup
{
blocks < 2, 1, 1 >;
cellsPerBlock < 64, 64, 64 >;
blocks < 1, 1, 1 >;
cellsPerBlock < 512, 256, 256 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 10000; // time steps of one performance measurement
timesteps 2000; //10000; // time steps of one performance measurement
warmupSteps 0; // number of steps to run before measurement starts
outerIterations 1; // how many measurements to conduct
// Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
communicationScheme UniformGPUScheme_Baseline;
vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled
vtkWriteFrequency 0; //100; // write a VTK file every n'th step, if zero VTK output is disabled
cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepStrategy noOverlap; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
@@ -25,6 +25,7 @@ Parameters
omega 1.92;
initShearFlow 1;
useGui 0;
}
/*
......
import sympy as sp
import numpy as np
import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor
from pystencils_walberla import generate_pack_info_from_kernel
from lbmpy_walberla import generate_lattice_model, generate_boundary
from pystencils_walberla import CodeGeneration, generate_sweep
from pystencils.data_types import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
@@ -16,7 +19,7 @@ sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
sweep_params = {'block_size': sweep_block_size}
options = {
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
@@ -49,17 +52,25 @@ options = {
}
with CodeGeneration() as ctx:
accessor = StreamPullTwoFieldsAccessor()
#accessor = StreamPushTwoFieldsAccessor()
assert not accessor.is_inplace, "This app does not work for inplace accessors"
common_options = {
'field_name': 'pdfs',
'temporary_field_name': 'pdfs_tmp',
#'kernel_type': 'collide_stream_push',
'kernel_type': accessor,
'optimization': {'cse_global': True,
'cse_pdfs': False}
}
options = options[ctx.config]
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q')+1:])
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
options['optimization']['symbolic_field'] = pdfs
vp = [
('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')
@@ -84,5 +95,13 @@ with CodeGeneration() as ctx:
generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1)
getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'UniformGridGPU_MacroSetter', setter_assignments)
generate_sweep(ctx, 'UniformGridGPU_MacroGetter', getter_assignments)
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_PackInfo', update_rule, target='gpu')
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/FlagField.h"
#include "field/AddToStorage.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "lbm/PerformanceLogger.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "geometry/all.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/communication/GPUPackInfo.h"
#include "cuda/ParallelStreams.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/UniformGPUScheme.h"
#include "cuda/DeviceSelectMPI.h"
#include "domain_decomposition/SharedSweep.h"
#include "stencil/D3Q19.h"
#include "stencil/D3Q27.h"
#include "InitShearVelocity.h"
#include "UniformGridGPU_AA_PackInfoPush.h"
#include "UniformGridGPU_AA_PackInfoPull.h"
#include "UniformGridGPU_AA_MacroSetter.h"
#include "UniformGridGPU_AA_MacroGetter.h"
#include "UniformGridGPU_AA_LbKernelEven.h"
#include "UniformGridGPU_AA_LbKernelOdd.h"
using namespace walberla;
using Stencil_T = stencil::D3Q19; //TODO make generic - and determine from python script
using CommunicationStencil_T = Stencil_T;
using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
using VelocityField_T = GhostLayerField< real_t, 3 >;
int main( int argc, char **argv )
{
mpi::Environment env( argc, argv );
cuda::selectDeviceBasedOnMpiRank();
for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
{
WALBERLA_MPI_WORLD_BARRIER();
auto config = *cfg;
logging::configureLogging( config );
auto blocks = blockforest::createUniformBlockGridFromConfig( config );
Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
// Reading parameters
auto parameters = config->getOneBlock( "Parameters" );
const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
initShearVelocity( blocks, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroGetter getterSweep( pdfFieldCpuID, velFieldCpuID );
pystencils::UniformGridGPU_AA_MacroSetter setterSweep( pdfFieldCpuID, velFieldCpuID );
for ( auto &block : *blocks )
setterSweep( &block );
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
int streamHighPriority = 0;
int streamLowPriority = 0;
WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange( &streamLowPriority, &streamHighPriority ));
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
using KernelEven = pystencils::UniformGridGPU_AA_LbKernelEven;
using KernelOdd = pystencils::UniformGridGPU_AA_LbKernelOdd;
using PackInfoPull = pystencils::UniformGridGPU_AA_PackInfoPull;
using PackInfoPush = pystencils::UniformGridGPU_AA_PackInfoPush;
using cuda::communication::UniformGPUScheme;
KernelEven kernelEven( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
KernelOdd kernelOdd ( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], innerOuterSplitCell );
kernelEven.setOuterPriority( streamHighPriority );
kernelOdd .setOuterPriority( streamHighPriority );
auto pullScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
auto pushScheme = make_shared< UniformGPUScheme< Stencil_T > >( blocks, cudaEnabledMPI );
pullScheme->addPackInfo( make_shared< PackInfoPull >( pdfFieldGpuID ) );
pushScheme->addPackInfo( make_shared< PackInfoPush >( pdfFieldGpuID ) );
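// The pull scheme fills ghost layers the odd kernel reads from; the push scheme
// merges ghost-layer entries the odd kernel writes to neighbors. The even kernel
// is purely cell-local and needs no communication of its own.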
auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
auto setupPhase = [&]() {
for ( auto &block: *blocks )
kernelEven( &block );
pullScheme->communicate();
for ( auto &block: *blocks )
kernelOdd( &block );
};
auto tearDownPhase = [&]() {
pushScheme->communicate();
cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
for ( auto &block : *blocks )
getterSweep( &block );
};
auto simpleOverlapTimeStep = [&]()
{
// Even
pushScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelEven.inner( &block, defaultStream );
pushScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelEven.outer( &block, defaultStream );
// Odd
pullScheme->startCommunication( defaultStream );
for ( auto &block: *blocks )
kernelOdd.inner( &block, defaultStream );
pullScheme->wait( defaultStream );
for ( auto &block: *blocks )
kernelOdd.outer( &block, defaultStream );
};
auto normalTimeStep = [&]()
{
pushScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
pullScheme->communicate( defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
auto kernelOnlyFunc = [&]()
{
for ( auto &block: *blocks )
kernelEven( &block, defaultStream );
for ( auto &block: *blocks )
kernelOdd( &block, defaultStream );
};
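// Each timeloop iteration performs one even and one odd LB step, so timesteps / 2
// iterations execute the requested number of time steps.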
SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
const std::string timeStepStrategy = parameters.getParameter< std::string >( "timeStepStrategy", "normal" );
std::function< void() > timeStep;
if ( timeStepStrategy == "noOverlap" )
timeStep = std::function< void() >( normalTimeStep );
else if ( timeStepStrategy == "simpleOverlap" )
timeStep = simpleOverlapTimeStep;
else if ( timeStepStrategy == "kernelOnly" )
{
WALBERLA_LOG_INFO_ON_ROOT( "Running only compute kernel without boundary - this makes only sense for benchmarking!" )
timeStep = kernelOnlyFunc;
}
else
{
WALBERLA_ABORT_NO_DEBUG_INFO(
"Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'" );
}
timeLoop.add() << BeforeFunction( timeStep )
<< Sweep( []( IBlock * ) {}, "time step" );
// VTK
uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
if ( vtkWriteFrequency > 0 )
{
auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0 );
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldCpuID, "vel" );
vtkOutput->addCellDataWriter( velWriter );
vtkOutput->addBeforeFunction( [&]()
{
tearDownPhase();
setupPhase();
} );
timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
}
int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
setupPhase();
for ( int i = 0; i < warmupSteps; ++i )
timeLoop.singleStep();
double remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
if ( remainingTimeLoggerFrequency > 0 )
{
auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
}
for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
cudaDeviceSynchronize();
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
simTimer.start();
timeLoop.run();
cudaDeviceSynchronize();
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
if ( pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
// Call Python function to report results
pythonCallbackResults();
}
}
}
}
return 0;
}
import sympy as sp
import numpy as np
import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
from pystencils_walberla import generate_pack_info_from_kernel
from pystencils_walberla import CodeGeneration, generate_sweep
from pystencils.data_types import TypedSymbol
from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
omega = sp.symbols("omega")
# sweep_block_size = (128, 1, 1)
sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
TypedSymbol("cudaBlockSize1", np.int32),
1)
sweep_params = {'block_size': sweep_block_size}
options_dict = {
'srt': {
'method': 'srt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
'compressible': False,
},
'trt': {
'method': 'trt',
'stencil': 'D3Q19',
'relaxation_rate': omega,
},
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
},
'entropic': {
'method': 'mrt3',
'stencil': 'D3Q19',
'compressible': True,
'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
'entropic': True,
},
'smagorinsky': {
'method': 'srt',
'stencil': 'D3Q19',
'smagorinsky': True,
'relaxation_rate': omega,
}
}
with CodeGeneration() as ctx:
accessors = {
'Even': AAEvenTimeStepAccessor(),
'Odd': AAOddTimeStepAccessor()
}
common_options = {
'field_name': 'pdfs',
'optimization': {'cse_global': True,
'cse_pdfs': False}
}
options = options_dict.get(ctx.config, options_dict['srt'])
options.update(common_options)
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q') + 1:])
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
options['optimization']['symbolic_field'] = pdfs
vp = [
('int32_t', 'cudaBlockSize0'),
('int32_t', 'cudaBlockSize1')
]
lb_method = create_lb_method(**options)
# Kernels
options_without_opt = options.copy()
del options_without_opt['optimization']
update_rules = {}
for name, accessor in accessors.items():
update_rule = create_lb_update_rule(lb_method=lb_method, kernel_type=accessor, **options)
update_rule = insert_fast_divisions(update_rule)
update_rule = insert_fast_sqrts(update_rule)
update_rules[name] = update_rule
generate_sweep(ctx, 'UniformGridGPU_AA_LbKernel' + name, update_rule,
inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
varying_parameters=vp)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=1)
getter_assignments = macroscopic_values_getter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs.center_vector, density=None)
generate_sweep(ctx, 'UniformGridGPU_AA_MacroSetter', setter_assignments)
generate_sweep(ctx, 'UniformGridGPU_AA_MacroGetter', getter_assignments)
# communication
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
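# Note: both pack infos are derived from the odd update rule - in the AA pattern the
# even kernel accesses only cell-local PDFs, so the odd kernel's neighbor reads (pull)
# and neighbor writes (push) alone determine what has to be communicated.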