diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
index 836baeefa42b5cdfe5c668eb5cdb14e4e288e93d..5d3f2dca099b2b2d3a92d6a61f2036f2452fd927 100644
--- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt
@@ -5,6 +5,7 @@ waLBerla_python_file_generates(UniformGridGenerated.py
         GenMacroGetter.cpp GenMacroSetter.cpp
         GenPackInfo.cpp GenPackInfoAAPush.cpp GenPackInfoAAPull.cpp
         GenLbKernel.cpp GenLbKernelAAEven.cpp GenLbKernelAAOdd.cpp
+        GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h
         GenDefines.h)
 
 
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
index ae22ab6b594604258e5f8fbf08a90b727684d46d..e53b3cd4f4ccdc0c1c1abd14fc3a05c0f6d8c8f7 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -1,21 +1,22 @@
 DomainSetup
 {
    blocks        <  1,    1,   1 >;
-   cellsPerBlock <  256, 128, 128 >;
+   cellsPerBlock <  64, 64, 64 >;
    periodic      <  1,    1,   1 >;
 }
 
 Parameters 
 {
 
-	timesteps       400;   // time steps of one performance measurement
-	warmupSteps     1;    // number of steps to run before measurement starts
+	timesteps       3000;   // time steps of one performance measurement
+	warmupSteps     1;      // number of steps to run before measurement starts
     outerIterations 1;      // how many measurements to conduct
 
-	vtkWriteFrequency 0;           // write a VTK file every n'th step, if zero VTK output is disabled
+	vtkWriteFrequency 100;           // write a VTK file every n'th step, if zero VTK output is disabled
 	cudaEnabledMPI false;            // switch on if you have a CUDA-enabled MPI implementation
-	timeStepMode aaKernelOnly;                 // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
+	timeStepMode aa;                 // can be e.g.: twoField, twoFieldKernelOnly, aa, aaKernelOnly
 	remainingTimeLoggerFrequency 0;  // interval in seconds to log the estimated remaining time
+    directComm 1;                    // if true, communicate via UniformDirectScheme instead of UniformBufferedScheme
 
 	omega 1.8;
 	useGui 0;
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index bbc904de35e6a8167ffb5be76224327f639de2e4..d6671b3b2f68662ba0971a8a4bdb94d6343f1098 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -7,6 +7,7 @@
 #include "field/vtk/VTKWriter.h"
 #include "field/AddToStorage.h"
 #include "blockforest/communication/UniformBufferedScheme.h"
+#include "blockforest/communication/UniformDirectScheme.h"
 #include "timeloop/all.h"
 #include "core/timing/TimingPool.h"
 #include "core/timing/RemainingTimeLogger.h"
@@ -25,6 +26,9 @@
 #include "GenPackInfo.h"
 #include "GenPackInfoAAPush.h"
 #include "GenPackInfoAAPull.h"
+#include "GenMpiDtypeInfo.h"
+#include "GenMpiDtypeInfoAAPull.h"
+#include "GenMpiDtypeInfoAAPush.h"
 
 
 using namespace walberla;
@@ -52,7 +56,7 @@ int main( int argc, char **argv )
       const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
             uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
       const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.08);
-
+      const bool directComm = parameters.getParameter<bool>("directComm", false);
 
       auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
           return new PdfField_T(storage->getNumberOfXCells(*block),
@@ -74,6 +78,7 @@ int main( int argc, char **argv )
       for( auto & b : *blocks)
           setterKernel(&b);
 
+      // Buffered Comm
       blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
       twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
 
@@ -83,12 +88,23 @@ int main( int argc, char **argv )
       blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
       aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
 
+      // Direct Comm
+      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
+      twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
+
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
+      aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
+
+      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
+      aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
+
+      using F = std::function<void()>;
       SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
       if( timeStepMode == "twoField")
       {
-          timeLoop.add() << BeforeFunction(twoFieldComm, "communication" )
+          timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
                          << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
-          timeLoop.add() << BeforeFunction(twoFieldComm, "communication" )
+          timeLoop.add()  << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
                          << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
 
       } else if ( timeStepMode == "twoFieldKernelOnly") {
@@ -96,9 +112,9 @@ int main( int argc, char **argv )
           timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
       } else if ( timeStepMode == "aa") {
           timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-          timeLoop.add() << BeforeFunction( aaPullComm )
+          timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
                          << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
-                         << AfterFunction( aaPushComm );
+                         << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
       } else if ( timeStepMode == "aaKernelOnly") {
           timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
           timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 74315df21ff29588b48bc503eb378378ef95a714..16b04e122bd61f7496433d1adca4bd42668a1b73 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -1,7 +1,7 @@
 import sympy as sp
 import pystencils as ps
 from lbmpy.creationfunctions import create_lb_update_rule
-from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep
+from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep, generate_mpidtype_info_from_kernel
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor
 
@@ -135,6 +135,10 @@ with CodeGeneration() as ctx:
     generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push',
                                    cpu_vectorize_info={'instruction_set': None})
 
+    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field)
+    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull')
+    generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push')
+
     # Info Header
     infoHeaderParams = {
         'stencil': stencil_str,