diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index b53cb24ca8ad9be070ab93d40ae86346a9c20f32..c3052b52393513f988d00b8730aad3ba6d6d1e81 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -10,6 +10,9 @@ waLBerla_python_file_generates(UniformGridGPU.py
         UniformGridGPU_PackInfo.cu UniformGridGPU_PackInfo.h
         )
 
-waLBerla_add_executable ( NAME UniformGridBenchmarkGPU
-                          FILES UniformGridGPU.cpp UniformGridGPU.py
-                          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
+foreach(config srt trt mrt smagorinsky entropic)
+    waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config}
+                              FILES UniformGridGPU.cpp UniformGridGPU.py
+                              DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk
+                              CODEGEN_CFG ${config})
+endforeach()
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index 32d06a834473ff86e05f75a4d0c4cccec3a07e67..181f981bc05c74a122f770fb88e3164f560d100a 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -1,5 +1,6 @@
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
+#include "core/math/Random.h"
 #include "python_coupling/CreateConfig.h"
 #include "python_coupling/PythonCallback.h"
 #include "python_coupling/DictWrapper.h"
@@ -48,6 +49,27 @@ using flag_t = walberla::uint8_t;
 using FlagField_T = FlagField<flag_t>;
 
 
+void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID,
+                       const real_t xMagnitude = 0.1, const real_t fluctuationMagnitude = 0.05 )
+{
+    math::seedRandomGenerator(0);
+    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
+    for( auto & block: *blocks)
+    {
+        PdfField_T * pdfField = block.getData<PdfField_T>( pdfFieldID );
+        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
+            Cell globalCell;
+            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
+            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
+            if( globalCell[2] >= halfZ ) {
+                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
+            } else {
+                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0, randomReal), real_t(1.0));
+            }
+        );
+    }
+}
+
 
 int main( int argc, char **argv )
 {
@@ -63,19 +85,25 @@ int main( int argc, char **argv )
       auto blocks = blockforest::createUniformBlockGridFromConfig( config );
 
       Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
-
       // Reading parameters
       auto parameters = config->getOneBlock( "Parameters" );
       const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
       const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
       const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() );
+      const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
 
       // Creating fields
       auto latticeModel = LatticeModel_T( omega );
       BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx );
+      if( initShearFlow ) {
+          WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow");
+          initShearVelocity( blocks, pdfFieldCpuID );
+      }
+
       BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
       BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
 
+
       // Boundaries
       const FlagUID fluidFlagUID( "Fluid" );
       auto boundariesConfig = config->getBlock( "Boundaries" );
@@ -95,7 +123,7 @@ int main( int argc, char **argv )
 
        // Communication setup
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
-
+      Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> >( "gpuBlockSize", Vector3<int32_t>(256, 1, 1) );
       const std::string communicationSchemeStr = parameters.getParameter<std::string>("communicationScheme", "UniformGPUScheme_Baseline");
       CommunicationSchemeType communicationScheme;
       if( communicationSchemeStr == "GPUPackInfo_Baseline")
@@ -106,6 +134,8 @@ int main( int argc, char **argv )
           communicationScheme = UniformGPUScheme_Baseline;
       else if (communicationSchemeStr == "UniformGPUScheme_Memcpy")
           communicationScheme = UniformGPUScheme_Memcpy;
+      else if (communicationSchemeStr == "MPIDatatypes")
+          communicationScheme = MPIDatatypes;
       else {
           WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
       }
@@ -116,8 +146,9 @@ int main( int argc, char **argv )
       int streamHighPriority = 0;
       int streamLowPriority = 0;
       WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
-
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
+      WALBERLA_CHECK(gpuBlockSize[2] == 1, "The generated GPU kernel uses a fixed block size of 1 in z direction");
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1],
+                                                    Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
       lbKernel.setOuterPriority( streamHighPriority );
       UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
          gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
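
Note on the initShearVelocity helper added above: the WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ macro takes the loop body as its last argument and iterates every cell of the field including the ghost layers, binding x, y and z to the block-local cell coordinates (and, among other things, taking care of OpenMP parallelization). Ignoring those extras, the helper is roughly equivalent to the explicit loops below. This is an illustrative sketch only, not part of the patch; it reuses the PdfField_T alias and the field/block-storage calls that already appear in UniformGridGPU.cpp.

// ---- sketch (not part of the patch): plain-loop version of initShearVelocity ----
void initShearVelocityPlain( const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldID,
                             const real_t xMagnitude = 0.1, const real_t fluctuationMagnitude = 0.05 )
{
    math::seedRandomGenerator( 0 );
    const auto halfZ = blocks->getDomainCellBB().zMax() / 2;
    for( auto & block : *blocks )
    {
        auto * pdfField = block.getData< PdfField_T >( pdfFieldID );
        // iterate all block-local cells of the PDF field, ghost layer included
        for( Cell localCell : pdfField->xyzSizeWithGhostLayer() )
        {
            Cell globalCell;
            blocks->transformBlockLocalToGlobalCell( globalCell, block, localCell );
            const real_t vz = xMagnitude * math::realRandom< real_t >( -fluctuationMagnitude, fluctuationMagnitude );
            const real_t vx = ( globalCell[2] >= halfZ ) ? xMagnitude : -xMagnitude;
            pdfField->setDensityAndVelocity( localCell.x(), localCell.y(), localCell.z(),
                                             Vector3< real_t >( vx, 0, vz ), real_t( 1.0 ) );
        }
    }
}
// ---- end sketch ----
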
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index 81a0996f872ab2f409a0bef1a9619de8a2095e6b..9c0796a49e48af5e8ae999f80a69c8b8de19a6d3 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -1,7 +1,7 @@
 DomainSetup
 {
-   blocks        <  1,    1,   1 >;
-   cellsPerBlock <  256, 256, 128 >;
+   blocks        <  2,    1,   1 >;
+   cellsPerBlock <  64, 64, 64 >;
    periodic      <  1,    1,   1 >;
 }
 
@@ -15,13 +15,16 @@ Parameters
     // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
     communicationScheme UniformGPUScheme_Baseline;
 
-	vtkWriteFrequency 0;             // write a VTK file every n'th step, if zero VTK output is disabled
+	vtkWriteFrequency 100;           // write a VTK file every n-th step; 0 disables VTK output
 	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
 
 	timeStepStrategy noOverlap;      // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
-	innerOuterSplit < 32, 1, 1>;     // slice-thickness that 'outer'-kernels process when overlapping
+	innerOuterSplit < 8, 1, 1>;     // slice-thickness that 'outer'-kernels process when overlapping
 
-	remainingTimeLoggerFrequency 0;  // interval in seconds to log the estimated remaining time
+	remainingTimeLoggerFrequency 5;  // interval in seconds to log the estimated remaining time
+
+	omega 1.92;                      // relaxation rate of the collision operator
+	initShearFlow 1;                 // initialize counter-flowing shear layers with small random perturbations
 }
 
 /*
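
A side note on the new omega value: with the standard SRT/BGK relation nu = c_s^2 * (1/omega - 1/2) and c_s^2 = 1/3, omega = 1.92 corresponds to a lattice viscosity of roughly 0.0069. A quick stand-alone check (illustrative only, not part of the patch):

// ---- sketch (not part of the patch): lattice viscosity implied by omega ----
#include <cstdio>

int main()
{
    const double omega = 1.92;
    const double nu    = ( 1.0 / omega - 0.5 ) / 3.0;   // c_s^2 = 1/3
    std::printf( "omega = %.2f  ->  lattice viscosity nu = %.6f\n", omega, nu );   // ~0.006944
    return 0;
}
// ---- end sketch ----
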
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index e5b05555a6c5cf47a6a01bba93f8da312a22818d..6e0539753bf8e0efbbae8d2475cba6a9f6e4f92d 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -1,37 +1,86 @@
 import sympy as sp
+import numpy as np
 from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
 from lbmpy.boundaries import NoSlip, UBB
 from pystencils_walberla import generate_pack_info_from_kernel
 from lbmpy_walberla import generate_lattice_model, generate_boundary
 from pystencils_walberla import CodeGeneration, generate_sweep
+from pystencils.data_types import TypedSymbol
+from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 
+omega = sp.symbols("omega")
+# The GPU block size is no longer fixed at code-generation time (it was (128, 1, 1));
+# it is set at runtime via the cudaBlockSize0/1 parameters below.
+sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                    TypedSymbol("cudaBlockSize1", np.int32),
+                    1)
 
-sweep_block_size = (128, 1, 1)
 sweep_params = {'block_size': sweep_block_size}
 
-
-with CodeGeneration() as ctx:
-    # LB options
-    options = {
+options = {
+    'srt': {
         'method': 'srt',
         'stencil': 'D3Q19',
-        'relaxation_rate': sp.Symbol("omega"),
-        'field_name': 'pdfs',
+        'relaxation_rate': omega,
         'compressible': False,
+    },
+    'trt': {
+        'method': 'trt',
+        'stencil': 'D3Q19',
+        'relaxation_rate': omega,
+    },
+    'mrt': {
+        'method': 'mrt',
+        'stencil': 'D3Q19',
+        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
+    },
+    'entropic': {
+        'method': 'mrt3',
+        'stencil': 'D3Q19',
+        'compressible': True,
+        'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
+        'entropic': True,
+    },
+    'smagorinsky': {
+        'method': 'srt',
+        'stencil': 'D3Q19',
+        'smagorinsky': True,
+        'relaxation_rate': omega,
+    }
+}
+
+with CodeGeneration() as ctx:
+
+    common_options = {
+        'field_name': 'pdfs',
         'temporary_field_name': 'pdfs_tmp',
+        # 'kernel_type': 'collide_stream_push',
         'optimization': {'cse_global': True,
-                         'cse_pdfs': False,
-                         }
+                         'cse_pdfs': False}
     }
+    options = options[ctx.config]
+    options.update(common_options)
+
+    vp = [
+        ('int32_t', 'cudaBlockSize0'),
+        ('int32_t', 'cudaBlockSize1')
+    ]
     lb_method = create_lb_method(**options)
     update_rule = create_lb_update_rule(lb_method=lb_method, **options)
 
+    update_rule = insert_fast_divisions(update_rule)
+    update_rule = insert_fast_sqrts(update_rule)
+
     # CPU lattice model - required for macroscopic value computation, VTK output etc.
-    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method)
+    options_without_opt = options.copy()
+    del options_without_opt['optimization']
+    generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method, update_rule_params=options_without_opt)
 
     # gpu LB sweep & boundaries
-    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')],
-                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params)
+    generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
+                   field_swaps=[('pdfs', 'pdfs_tmp')],
+                   inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params,
+                   varying_parameters=vp)
+
     generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu')
     generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu')
 
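Because sweep_block_size is now built from TypedSymbols and cudaBlockSize0/1 are registered as varying_parameters, the generated UniformGridGPU_LbKernel presumably exposes the block size as two extra constructor arguments; that is how the driver passes the runtime-configurable GPU block size through. A short usage sketch, with names taken from the UniformGridGPU.cpp change earlier in this patch:

// ---- sketch (not part of the patch): how the runtime block size reaches the generated kernel ----
// gpuBlockSize is read from the prm file in UniformGridGPU.cpp; the generated kernel fixes z to 1.
Vector3< int32_t > gpuBlockSize = parameters.getParameter< Vector3<int32_t> >( "gpuBlockSize",
                                                                               Vector3<int32_t>( 256, 1, 1 ) );
WALBERLA_CHECK( gpuBlockSize[2] == 1 );
pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega,
                                              gpuBlockSize[0], gpuBlockSize[1],
                                              Cell( innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2] ) );
// ---- end sketch ----
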
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
index 3e45caa94d6f34bafdc302e97422522b072a615e..6a3bf6b532fe67b753cf2d64737a25d7747e067d 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
@@ -4,6 +4,8 @@
 #include "blockforest/Block.h"
 #include "blockforest/StructuredBlockForest.h"
 #include "blockforest/communication/UniformBufferedScheme.h"
+#include "blockforest/communication/UniformDirectScheme.h"
+#include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
 #include "cuda/communication/GPUPackInfo.h"
 #include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/communication/MemcpyPackInfo.h"
@@ -17,7 +19,8 @@ enum CommunicationSchemeType {
     GPUPackInfo_Baseline = 0,
     GPUPackInfo_Streams = 1,
     UniformGPUScheme_Baseline = 2,
-    UniformGPUScheme_Memcpy = 3
+    UniformGPUScheme_Memcpy = 3,
+    MPIDatatypes = 4
 };
 
 
@@ -28,8 +31,12 @@ public:
     explicit UniformGridGPU_Communication(weak_ptr_wrapper<StructuredBlockForest> bf, const BlockDataID & bdId,
                                           CommunicationSchemeType commSchemeType, bool cudaEnabledMPI = false)
         : _commSchemeType(commSchemeType), _cpuCommunicationScheme(nullptr), _gpuPackInfo(nullptr),
-          _gpuCommunicationScheme(nullptr), _generatedPackInfo(nullptr)
+          _gpuCommunicationScheme(nullptr), _directScheme(nullptr)
     {
+        auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
+        auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
+        auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
+
         switch(_commSchemeType)
         {
             case GPUPackInfo_Baseline:
@@ -44,13 +51,17 @@ public:
                 break;
             case UniformGPUScheme_Baseline:
                 _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
-                _generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
-                _gpuCommunicationScheme->addPackInfo( _generatedPackInfo );
+                _gpuCommunicationScheme->addPackInfo( generatedPackInfo );
                 break;
             case UniformGPUScheme_Memcpy:
                 _gpuCommunicationScheme = make_shared< cuda::communication::UniformGPUScheme< StencilType > >( bf, cudaEnabledMPI );
-                _memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
-                _gpuCommunicationScheme->addPackInfo( _memcpyPackInfo );
+                _gpuCommunicationScheme->addPackInfo( memcpyPackInfo );
+                break;
+            case MPIDatatypes:
+                if( ! cudaEnabledMPI ) {
+                    WALBERLA_ABORT("The MPIDatatypes communication scheme requires a CUDA-aware MPI implementation (set cudaEnabledMPI to true).");
+                }
+                _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo );
                 break;
             default:
                 WALBERLA_ABORT("Invalid GPU communication scheme specified!");
@@ -91,6 +102,10 @@ public:
                 WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
                 _gpuCommunicationScheme->startCommunication( communicationStream );
                 break;
+            case MPIDatatypes:
+                WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
+                _directScheme->startCommunication();
+                break;
         }
     }
 
@@ -115,6 +130,10 @@ public:
                 WALBERLA_ASSERT_NOT_NULLPTR( _gpuCommunicationScheme );
                 _gpuCommunicationScheme->wait( communicationStream );
                 break;
+            case MPIDatatypes:
+                WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
+                _directScheme->wait();
+                break;
         }
     }
 
@@ -123,6 +142,5 @@ private:
     shared_ptr< blockforest::communication::UniformBufferedScheme< StencilType > > _cpuCommunicationScheme;
     shared_ptr< cuda::communication::GPUPackInfo< GPUFieldType > > _gpuPackInfo;
     shared_ptr< cuda::communication::UniformGPUScheme< StencilType > > _gpuCommunicationScheme;
-    shared_ptr< pystencils::UniformGridGPU_PackInfo > _generatedPackInfo;
-    shared_ptr< cuda::communication::MemcpyPackInfo< GPUFieldType > > _memcpyPackInfo;
+    shared_ptr< blockforest::communication::UniformDirectScheme<StencilType> > _directScheme;
 };
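
Finally, a usage sketch for the new MPIDatatypes mode of this wrapper. The constructor call mirrors the one in UniformGridGPU.cpp; note that, as implemented above, the stream passed to startCommunication()/wait() is simply not forwarded for this scheme, since the UniformDirectScheme sends directly from the GPU field via MPI datatypes (which is why a CUDA-aware MPI is required). The stream type is assumed to be cudaStream_t here.

// ---- sketch (not part of the patch): driving the wrapper in MPIDatatypes mode ----
UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > >
    gpuComm( blocks, pdfFieldGpuID, MPIDatatypes, /*cudaEnabledMPI=*/ true );

cudaStream_t stream = 0;                 // default stream; not forwarded by the MPIDatatypes path
gpuComm.startCommunication( stream );    // posts the sends/receives via UniformDirectScheme
// ... overlapped inner-kernel work could go here ...
gpuComm.wait( stream );                  // returns once all boundary data has arrived
// ---- end sketch ----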