diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
index 22493880997cfecd041cb36e99c7a76fe1e2f122..41d29bbccba9df9ed7850b0daa7e0a71a4e157a8 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
+++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm
@@ -1,13 +1,13 @@
 DomainSetup
 {
    blocks        <  1,   1,  1 >;
-   cellsPerBlock <  64, 64, 64 >;
+   cellsPerBlock <  300, 64, 64 >;
    periodic      <  1,   1,  1 >;
 }
 
 Parameters 
 {
-	timeStepMode twoField;
+	timeStepMode aa;
         // twoField: normal src-dst update with two fields [default]
         // twoFieldKernelOnly: same as above but without communication and periodicity
         // aa: AA single-field update pattern
@@ -22,11 +22,13 @@ Parameters
         // manualD3Q19: manual D3Q19
 
 
-	timesteps       200;             // time steps of one performance measurement default 60
+
+	timesteps       2000;             // time steps per performance measurement (default: 60)
 	warmupSteps     1;               // number of steps to run before measurement starts
-    outerIterations 4;               // how many measurements to conduct
-	vtkWriteFrequency 0;             // write a VTK file every n'th step, if zero VTK output is disabled
+    outerIterations 1;               // how many measurements to conduct
+	vtkWriteFrequency 100;             // write a VTK file every n'th step, if zero VTK output is disabled
 	remainingTimeLoggerFrequency 6;  // interval in seconds to log the estimated remaining time
+    fPadding  3;                     // elements of padding inserted between consecutive f-slices of the PDF field (0 disables padding)
 
 	useGui 0;
 
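
Note on the numbers above: with cellsPerBlock < 300, 64, 64 >, one ghost layer per
direction, and real_t = double (assumed here; it is a build option), each f-slice of
the PDF field holds (300+2) * (64+2) * (64+2) = 1,315,512 elements. fPadding 3 then
enlarges the f stride to 1,315,515 elements, staggering the start addresses of
consecutive slices by 24 bytes.
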
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
index e1b013e0adf606b2763ba54d69102c438d92f185..5ca6398df9172f0de561e8250fc97d5f3c171b86 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp
@@ -43,31 +43,91 @@ using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
 using VelocityField_T = GhostLayerField< real_t, 3 >;
 
 
-void pinOpenMP(const char * pinString)
+template <typename T>
+class OuterPaddingFieldAllocator : public field::FieldAllocator<T>
+{
+public:
+    OuterPaddingFieldAllocator( uint_t paddingElements )
+        : padding_(paddingElements)
+    {}
+
+    virtual T * allocate(const field::Layout & layout,
+                         uint_t xSize, uint_t ySize, uint_t zSize, uint_t fSize,
+                         uint_t & xAllocSize, uint_t & yAllocSize, uint_t & zAllocSize, uint_t & fAllocSize,
+                         cell_idx_t & xStride, cell_idx_t & yStride, cell_idx_t & zStride, cell_idx_t & fStride)
+    {
+        T * ptr;
+
+        if (layout == field::fzyx ) {
+            ptr = field::FieldAllocator<T>::allocateField(fSize, zSize, ySize, xSize, fAllocSize, zAllocSize, yAllocSize, xAllocSize);
+
+            WALBERLA_CHECK_LESS_EQUAL( fSize * (xAllocSize * yAllocSize * zAllocSize + padding_) + xSize + ySize * xAllocSize + zSize * xAllocSize * yAllocSize,
+                                       std::numeric_limits< cell_idx_t >::max(),
+                                       "The data type 'cell_idx_t' is too small for your field size! Your field is too large.\nYou may have to set 'cell_idx_t' to an 'int64_t'." );
+
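+            // enlarge the f stride by padding_ elements so that consecutive f-slices start
+            // at staggered addresses, which can reduce cache set conflicts between slices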
+            fStride = cell_idx_c(xAllocSize * yAllocSize * zAllocSize + padding_);
+            zStride = cell_idx_c(xAllocSize * yAllocSize);
+            yStride = cell_idx_c(xAllocSize);
+            xStride = 1;
+        } else {
+            WALBERLA_ABORT("OuterPaddingFieldAllocator works only for fzyx layout");
+        }
+
+        return ptr;
+    }
+
+    virtual T * allocateMemory (  uint_t size0, uint_t size1, uint_t size2, uint_t size3,
+                                  uint_t & allocSize0, uint_t & allocSize1, uint_t & allocSize2, uint_t & allocSize3 )
+    {
+        allocSize0 = size0;
+        allocSize1 = size1;
+        allocSize2 = size2;
+        allocSize3 = size3;
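+        // reserve padding_ extra elements per f-slice (size0 is fSize for fzyx fields)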
+        return new T[allocSize0 * allocSize1 * allocSize2 * allocSize3 + padding_ * size0];
+    }
+
+    virtual T * allocateMemory ( uint_t size )
+    {
+        return new T[size];
+    }
+
+    virtual void deallocate(T *& values)
+    {
+        delete[] values;
+        values = nullptr;
+    }
+private:
+    uint_t padding_;
+};
+
+
+void pinOpenMP( const char *pinString )
 {
 #ifdef WALBERLA_BUILD_WITH_OPENMP
-    if (pinString != NULL) {
-        #pragma omp parallel
+    if ( pinString != NULL )
+    {
+#pragma omp parallel
         {
             int threadId = omp_get_thread_num();
             int err;
 
-            err = PinCurrentThreadByCpuList(pinString, threadId);
+            err = PinCurrentThreadByCpuList( pinString, threadId );
 
-            if (err) {
-                WALBERLA_ABORT("Pinning of " << threadId << "failed");
+            if ( err )
+            {
+                WALBERLA_ABORT( "Pinning of " << threadId << " failed" );
             }
 
-            const char * cpuList = PinCpuListAsString();
-            WALBERLA_ASSERT(cpuList != NULL);
+            const char *cpuList = PinCpuListAsString();
+            WALBERLA_ASSERT( cpuList != NULL );
 
             // Not so nice hack to print the thread ids ordered.
-            #pragma omp for ordered
-            for (int i = 0; i < omp_get_num_threads(); ++i) {
-                #pragma omp ordered
-                WALBERLA_LOG_INFO("Thread " << threadId << " pinned to core(s) " << cpuList);
+#pragma omp for ordered
+            for ( int i = 0; i < omp_get_num_threads(); ++i )
+            {
+#pragma omp ordered
+                WALBERLA_LOG_INFO( "Thread " << threadId << " pinned to core(s) " << cpuList );
             }
-            free((void *)cpuList);
+            free( (void *) cpuList );
         }
     }
 #endif
@@ -75,194 +135,219 @@ void pinOpenMP(const char * pinString)
 
 int main( int argc, char **argv )
 {
-   mpi::Environment env( argc, argv );
-
-   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
-   {
-      WALBERLA_MPI_WORLD_BARRIER();
-
-      auto config = *cfg;
-      logging::configureLogging( config );
-      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
-
-      Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t>  >( "cellsPerBlock" );
-      // Reading parameters
-      auto parameters = config->getOneBlock( "Parameters" );
-      const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField");
-      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
-            uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 ));
-      const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", 0.02);
-      const bool directComm = parameters.getParameter<bool>("directComm", false);
-
-      const std::string pinning = parameters.getParameter<std::string>("pinning", "");
-      if( !pinning.empty() )
-          pinOpenMP(pinning.c_str());
-
-      auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) {
-          return new PdfField_T(storage->getNumberOfXCells(*block),
-                                storage->getNumberOfYCells(*block),
-                                storage->getNumberOfZCells(*block),
-                                uint_t(1),
-                                field::fzyx,
-                                make_shared<field::AllocateAligned<real_t, 64>>());
-      };
-
-      // Creating fields
-      BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-      BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
-
-      pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId);
-      pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId);
-
-      if( shearVelocityMagnitude > 0 )
-          initShearVelocity(blocks, velFieldId, shearVelocityMagnitude);
-      for( auto & b : *blocks)
-          setterKernel(&b);
-
-      // Buffered Comm
-      blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks );
-      twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) );
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks);
-      aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId));
-
-      blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks);
-      aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId));
-
-      // Direct Comm
-      blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks);
-      twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId));
-
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks);
-      aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId));
-
-      blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks);
-      aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId));
-
-
-      const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated");
-      std::function<void(IBlock*)> twoFieldKernel;
-      if( twoFieldKernelType == "generated") {
-          twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualGeneric") {
-          using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-          BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-          twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else if (twoFieldKernelType == "manualD3Q19") {
-          using MyLM = lbm::D3Q19<lbm::collision_model::SRT>;
-          BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs");
-          twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega);
-      } else {
-          WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", "
-                                       "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"");
-      }
-
-      using F = std::function<void()>;
-      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
-      if( timeStepMode == "twoField")
-      {
-          timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                         << Sweep( twoFieldKernel, "LB stream & collide1" );
-          timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" )
-                         << Sweep( twoFieldKernel, "LB stream & collide2" );
-
-      } else if ( timeStepMode == "twoFieldKernelOnly") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" );
-          timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" );
-      } else if ( timeStepMode == "aa") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-          timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) )
-                         << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd")
-                         << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) );
-      } else if ( timeStepMode == "aaKernelOnly") {
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" );
-          timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd");
-      } else {
-          WALBERLA_ABORT("Invalid value for timeStepMode");
-      }
-
-
-      int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
-      int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
-      for(int i=0; i < warmupSteps; ++i )
-         timeLoop.singleStep();
-
-      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
-      if (remainingTimeLoggerFrequency > 0) {
-          auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
-          timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
-      }
-
-      // VTK
-      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
-      if( vtkWriteFrequency > 0 )
-      {
-          auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
-                                                           "simulation_step", false, true, true, false, 0 );
-          auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
-          vtkOutput->addCellDataWriter( velWriter );
-          vtkOutput->addBeforeFunction( [&]()
-                                        { for( auto & b : *blocks)
-                                            getterKernel(&b);
-                                        } );
-          timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
-      }
-
-
-      bool useGui = parameters.getParameter<bool>( "useGui", false );
-      if( useGui )
-      {
-          GUI gui( timeLoop, blocks, argc, argv);
-          gui.run();
-      }
-      else
-      {
-          for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
-          {
-              timeLoop.setCurrentTimeStepToZero();
-              WcTimer simTimer;
-
-              auto threads = omp_get_max_threads();
-
-              simTimer.start();
-              timeLoop.run();
-              simTimer.end();
-              auto time = simTimer.last();
-              auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-              auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-
-              using std::setw;
-              WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
-                                                     "  procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
-                                                     "  threads: " << threads <<
-                                                     "  direct_comm: " << directComm <<
-                                                     "  time steps: " << timesteps <<
-                                                     setw(15) << "  block size: " << cellsPerBlock <<
-                                                     "  mlups/core:  " << int(mlupsPerProcess/ threads) <<
-                                                     "  mlups:  " << int(mlupsPerProcess) *  MPIManager::instance()->numProcesses());
-
-              WALBERLA_ROOT_SECTION()
-              {
-                  python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-                  if ( pythonCallbackResults.isCallable())
-                  {
-                      pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-                      pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
-                      pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
-                      pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
-                      pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
-                      pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
-                      pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
-                      pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
-                      pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
-
-                      // Call Python function to report results
-                      pythonCallbackResults();
-                  }
-              }
-          }
-      }
-   }
-
-   return 0;
+    mpi::Environment env( argc, argv );
+
+    for ( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+    {
+        WALBERLA_MPI_WORLD_BARRIER();
+
+        auto config = *cfg;
+        logging::configureLogging( config );
+        auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+        Vector3< uint_t > cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter< Vector3< uint_t > >( "cellsPerBlock" );
+        // Reading parameters
+        auto parameters = config->getOneBlock( "Parameters" );
+        const std::string timeStepMode = parameters.getParameter< std::string >( "timeStepMode", "twoField" );
+        const real_t omega = parameters.getParameter< real_t >( "omega", real_c( 1.4 ));
+        uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 60 ));
+        const real_t shearVelocityMagnitude = parameters.getParameter< real_t >( "shearVelocityMagnitude", 0.02 );
+        const bool directComm = parameters.getParameter< bool >( "directComm", false );
+        const uint_t fPadding = parameters.getParameter< uint_t >( "fPadding", uint_c( 0 ));
+
+        const std::string pinning = parameters.getParameter< std::string >( "pinning", "" );
+        if ( !pinning.empty())
+            pinOpenMP( pinning.c_str());
+
+        auto pdfFieldAdder = [fPadding]( IBlock *const block, StructuredBlockStorage *const storage )
+        {
+            shared_ptr< field::FieldAllocator< real_t > > allocator;
+            if ( fPadding > 0 )
+                allocator = make_shared< OuterPaddingFieldAllocator< real_t > >( fPadding );
+            else
+                allocator = make_shared< field::AllocateAligned< real_t, 64 > >();
+
+            return new PdfField_T( storage->getNumberOfXCells( *block ),
+                                   storage->getNumberOfYCells( *block ),
+                                   storage->getNumberOfZCells( *block ),
+                                   uint_t( 1 ),
+                                   field::fzyx,
+                                   allocator );
+        };
+
+        // Creating fields
+        BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+        BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
+
+        pystencils::GenMacroSetter setterKernel( pdfFieldId, velFieldId );
+        pystencils::GenMacroGetter getterKernel( pdfFieldId, velFieldId );
+
+        if ( shearVelocityMagnitude > 0 )
+            initShearVelocity( blocks, velFieldId, shearVelocityMagnitude );
+        for ( auto &b : *blocks )
+            setterKernel( &b );
+
+        // Buffered Comm
+        blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm( blocks );
+        twoFieldComm.addPackInfo( make_shared< pystencils::GenPackInfo >( pdfFieldId ));
+
+        blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm( blocks );
+        aaPullComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPull >( pdfFieldId ));
+
+        blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm( blocks );
+        aaPushComm.addPackInfo( make_shared< pystencils::GenPackInfoAAPush >( pdfFieldId ));
+
+        // Direct Comm
+        blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect( blocks );
+        twoFieldCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfo >( pdfFieldId ));
+
+        blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect( blocks );
+        aaPullCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPull >( pdfFieldId ));
+
+        blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect( blocks );
+        aaPushCommDirect.addDataToCommunicate( make_shared< pystencils::GenMpiDtypeInfoAAPush >( pdfFieldId ));
+
+
+        const std::string twoFieldKernelType = parameters.getParameter< std::string >( "twoFieldKernelType", "generated" );
+        std::function< void( IBlock * ) > twoFieldKernel;
+        if ( twoFieldKernelType == "generated" )
+        {
+            twoFieldKernel = pystencils::GenLbKernel( pdfFieldId, omega );
+        }
+        else if ( twoFieldKernelType == "manualGeneric" )
+        {
+            using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+            BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+            twoFieldKernel = StreamPullCollideGeneric< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+        }
+        else if ( twoFieldKernelType == "manualD3Q19" )
+        {
+            using MyLM = lbm::D3Q19< lbm::collision_model::SRT >;
+            BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData< PdfField_T >( pdfFieldAdder, "pdfs" );
+            twoFieldKernel = StreamPullCollideD3Q19< MyLM >( pdfFieldId, tmpPdfFieldId, omega );
+        }
+        else
+        {
+            WALBERLA_ABORT_NO_DEBUG_INFO( "Invalid option for \"twoFieldKernelType\", "
+                                          "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"" );
+        }
+
+        using F = std::function< void() >;
+        SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 );
+        if ( timeStepMode == "twoField" )
+        {
+            timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                           << Sweep( twoFieldKernel, "LB stream & collide1" );
+            timeLoop.add() << BeforeFunction( directComm ? F( twoFieldCommDirect ) : F( twoFieldComm ), "communication" )
+                           << Sweep( twoFieldKernel, "LB stream & collide2" );
+
+        }
+        else if ( timeStepMode == "twoFieldKernelOnly" )
+        {
+            timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide1" );
+            timeLoop.add() << Sweep( pystencils::GenLbKernel( pdfFieldId, omega ), "LB stream & collide2" );
+        }
+        else if ( timeStepMode == "aa" )
+        {
+            timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+            timeLoop.add() << BeforeFunction( directComm ? F( aaPullCommDirect ) : F( aaPullComm ))
+                           << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" )
+                           << AfterFunction( directComm ? F( aaPushCommDirect ) : F( aaPushComm ));
+        }
+        else if ( timeStepMode == "aaKernelOnly" )
+        {
+            timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven( pdfFieldId, omega ), "AA Even" );
+            timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd( pdfFieldId, omega ), "AA Odd" );
+        }
+        else
+        {
+            WALBERLA_ABORT( "Invalid value for timeStepMode" );
+        }
+
+
+        int warmupSteps = parameters.getParameter< int >( "warmupSteps", 2 );
+        int outerIterations = parameters.getParameter< int >( "outerIterations", 1 );
+        for ( int i = 0; i < warmupSteps; ++i )
+            timeLoop.singleStep();
+
+        auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+        if ( remainingTimeLoggerFrequency > 0 )
+        {
+            auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+            timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+        }
+
+        // VTK
+        uint_t vtkWriteFrequency = parameters.getParameter< uint_t >( "vtkWriteFrequency", 0 );
+        if ( vtkWriteFrequency > 0 )
+        {
+            auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                             "simulation_step", false, true, true, false, 0 );
+            auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" );
+            vtkOutput->addCellDataWriter( velWriter );
+            vtkOutput->addBeforeFunction( [&]()
+                                          {
+                                              for ( auto &b : *blocks )
+                                                  getterKernel( &b );
+                                          } );
+            timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+        }
+
+
+        bool useGui = parameters.getParameter< bool >( "useGui", false );
+        if ( useGui )
+        {
+            GUI gui( timeLoop, blocks, argc, argv );
+            gui.run();
+        }
+        else
+        {
+            for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+            {
+                timeLoop.setCurrentTimeStepToZero();
+                WcTimer simTimer;
+
+                auto threads = omp_get_max_threads();
+
+                simTimer.start();
+                timeLoop.run();
+                simTimer.end();
+                auto time = simTimer.last();
+                auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+                auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+
+                using std::setw;
+                WALBERLA_LOG_INFO_ON_ROOT( setw( 18 ) << timeStepMode <<
+                                                      "  procs: " << setw( 6 ) << MPIManager::instance()->numProcesses() <<
+                                                      "  threads: " << threads <<
+                                                      "  direct_comm: " << directComm <<
+                                                      "  time steps: " << timesteps <<
+                                                      setw( 15 ) << "  block size: " << cellsPerBlock <<
+                                                      "  mlups/core:  " << int( mlupsPerProcess / threads ) <<
+                                                      "  mlups:  " << int( mlupsPerProcess ) * MPIManager::instance()->numProcesses());
+
+                WALBERLA_ROOT_SECTION()
+                {
+                    python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+                    if ( pythonCallbackResults.isCallable())
+                    {
+                        pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                        pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
+                        pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
+                        pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode );
+                        pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType );
+                        pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
+                        pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1());
+                        pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags());
+                        pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine());
+
+                        // Call Python function to report results
+                        pythonCallbackResults();
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
 }
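
The index arithmetic that OuterPaddingFieldAllocator sets up can be checked in
isolation. Below is a minimal standalone sketch (no waLBerla dependency; the sizes
correspond to the 300 x 64 x 64 block with one ghost layer, and all names are
illustrative, not waLBerla API):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main()
    {
        // per-slice allocation sizes: interior cells plus one ghost layer per side
        const std::size_t xAlloc = 302, yAlloc = 66, zAlloc = 66;
        const std::size_t fSize = 19, padding = 3;          // D3Q19 with fPadding 3

        const std::size_t sliceSize = xAlloc * yAlloc * zAlloc;
        const std::size_t fStride = sliceSize + padding;    // as set by the allocator

        // the allocator reserves 'padding' extra elements per f-slice
        std::vector< double > data( fSize * sliceSize + padding * fSize );

        // linear index of element (f, z, y, x) in fzyx layout with outer padding
        auto idx = [&]( std::size_t f, std::size_t z, std::size_t y, std::size_t x ) {
            return f * fStride + z * ( xAlloc * yAlloc ) + y * xAlloc + x;
        };

        // consecutive f-slices start sliceSize + padding (not sliceSize) elements apart
        assert( idx( 1, 0, 0, 0 ) - idx( 0, 0, 0, 0 ) == sliceSize + padding );
        // the highest addressable element still fits into the allocation
        assert( idx( fSize - 1, zAlloc - 1, yAlloc - 1, xAlloc - 1 ) < data.size() );
        return 0;
    }
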
diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
index 21ee9f0c80c3494be684aa69391b55c912e0b88f..0c8b7ff3c39003fa7efb65ba7e9376c72d7738b5 100644
--- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
+++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py
@@ -150,16 +150,19 @@ with CodeGeneration() as ctx:
                                                                  'cse_global': opts['aa_odd_cse_global'],
                                                                  'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)
 
-    vec = { 'assume_aligned': True, 'assume_inner_stride_one': True}
+    vec = {'assume_aligned': True, 'assume_inner_stride_one': True}
 
     # Sweeps
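+    # keep 'assume_aligned' in sync with the nontemporal-store flags: streaming stores
+    # are only valid for aligned accesses, and fPadding can break per-slice alignment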
     vec['nontemporal'] = opts['two_field_nt_stores']
+    vec['assume_aligned'] = opts['two_field_nt_stores']
     generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
                    cpu_vectorize_info=vec)
     vec['nontemporal'] = opts['aa_even_nt_stores']
+    vec['assume_aligned'] = opts['aa_even_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
     vec['nontemporal'] = opts['aa_odd_nt_stores']
+    vec['assume_aligned'] = opts['aa_odd_nt_stores']
     generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
                    cpu_openmp=True, ghost_layers=1)
 
diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py
index 724238f4bb456be75906dafeff6b0683e4dbe3a4..f701bed0b2368491119c76e5144b64855d7e5198 100644
--- a/apps/benchmarks/UniformGridGenerated/params.py
+++ b/apps/benchmarks/UniformGridGenerated/params.py
@@ -51,7 +51,7 @@ def domain_decomposition_func_full(processes, threads, block_size):
 class BenchmarkScenario:
     def __init__(self, block_size=(256, 128, 128), direct_comm=True,
                  time_step_mode='aa', two_field_kernel_type='generated',
-                 domain_decomposition_func=domain_decomposition_func_z,
+                 domain_decomposition_func=domain_decomposition_func_z, pinning="", f_padding=0,
                  db_file_name='uniform_grid_gen.sqlite'):
         self.block_size = block_size
         self.direct_comm = direct_comm
@@ -61,6 +61,8 @@ class BenchmarkScenario:
         self.threads = int(os.environ['OMP_NUM_THREADS'])
         self.processes = wlb.mpi.numProcesses()
         self.db_file_name = db_file_name
+        self.pinning = pinning
+        self.f_padding = f_padding
 
     @wlb.member_callback
     def config(self, **kwargs):
@@ -81,6 +83,8 @@ class BenchmarkScenario:
                 'timeStepMode': self.time_step_mode,
                 'twoFieldKernelType': self.two_field_kernel_type,
                 'directComm': self.direct_comm,
+                'pinning': self.pinning,
+                'fPadding': self.f_padding,
             }
         }
         cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size))
@@ -168,4 +172,20 @@ def weak_scaling():
                 continue
             scenarios.add(sc)
 
-single_node_benchmark()
+
+def padding_test():
+    scenarios = wlb.ScenarioManager()
+    for block_size in [(300, 100, 100), (500, 100, 100)]:
+        for direct_comm in (False,):
+            for time_step_mode in ['aa', 'aaKernelOnly']:
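+                # sweep the outer padding from 0 to 15 elements per f-slice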
+                for f_padding in range(16):
+                    sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm,
+                                           time_step_mode=time_step_mode, domain_decomposition_func=domain_decomposition_func_z,
+                                           f_padding=f_padding, pinning="0")
+                    if not block_size_ok(sc):
+                        continue
+                    scenarios.add(sc)
+
+
+#single_node_benchmark()
+padding_test()