diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
index 08e5de4d928d9a23e76104823721dfbb5d811f69..494424a562e25503c6b189fa5cf2d957fafca313 100644
--- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
+++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp
@@ -194,11 +194,11 @@ int main(int argc, char** argv)
 
       gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false);
       comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU));
-      auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); });
+      auto evenComm = std::function< void() >([&]() { comEven.communicate(); });
 
       gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false);
       comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU));
-      auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); });
+      auto oddComm = std::function< void() >([&]() { comODD.communicate(); });
 #else
       blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks);
       evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID));
diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
index de1a2c1db0cbd078a80879f964f6a046f98afc95..2b37ed6fb19797229ae5507f4d26f3f031491a87 100644
--- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt
@@ -13,9 +13,9 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
 
 waLBerla_add_executable( NAME NonUniformGridGenerator
                          FILES NonUniformGridGenerator.cpp LdcSetup.h
-                         DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk )
+                         DEPENDS blockforest core field python_coupling )
 
 
 waLBerla_add_executable( NAME NonUniformGridCPU
                          FILES NonUniformGridCPU.cpp LdcSetup.h
-                         DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated )
+                         DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
index 1657e85e0e58dc0a1107ba62505e438c1821a1c1..070656cb23582a4da83f954c3e108aa70e69b315 100644
--- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
+++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h
@@ -14,12 +14,15 @@
 //  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
 //! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
 //! \author Frederik Hennig <frederik.hennig@fau.de>
 //
 //======================================================================================================================
+#pragma once
 
 #include "blockforest/SetupBlock.h"
 #include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
 
 #include "core/all.h"
 
@@ -45,14 +48,14 @@ class LDCRefinement
    {
       const AABB & domain = forest.getDomain();
 
-      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
 
-      AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(),
-                      domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() );
+      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                            domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
 
-      AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(),
-                       domain.xMax(), domain.yMax(), domain.zMax() );
+      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                             domain.xMax(), domain.yMin() + ySize, domain.zMax() );
 
       for(auto & block : forest)
       {
diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
index d6840007e14d5f5af685bb5b262c8bcfd6138d6e..f6b4e1ff3779f624c8fb9845425d9d6a86103ee9 100644
--- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt
@@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
         NonUniformGridGPUBoundaryCollection.h
         NonUniformGridGPUInfoHeader.h)
 waLBerla_add_executable( NAME NonUniformGridGPU
-                         FILES NonUniformGridGPU.cpp
-                         DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
+                         FILES NonUniformGridGPU.cpp LdcSetup.h
+                         DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
new file mode 100644
index 0000000000000000000000000000000000000000..238943a7daa9745054980e6011d46ef037ef27ec
--- /dev/null
+++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h
@@ -0,0 +1,110 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file LdcSetup.h
+//! \author Markus Holzer <markus.holzer@fau.de>
+//! \author Frederik Hennig <frederik.hennig@fau.de>
+//
+//======================================================================================================================
+#pragma once
+
+#include "blockforest/SetupBlock.h"
+#include "blockforest/SetupBlockForest.h"
+#include "blockforest/StructuredBlockForest.h"
+
+#include "core/all.h"
+
+#include "field/FlagField.h"
+
+#include "field/FlagUID.h"
+
+using namespace walberla;
+using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
+using FlagField_T          = FlagField< uint8_t >;
+
+class LDCRefinement
+{
+ private:
+   const uint_t refinementDepth_;
+
+ public:
+   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
+
+   void operator()(SetupBlockForest& forest) const
+   {
+      const AABB & domain = forest.getDomain();
+
+      const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
+      const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
+
+      const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
+                             domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
+
+      const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
+                              domain.xMax(), domain.yMin() + ySize, domain.zMax() );
+
+      for(auto & block : forest)
+      {
+         auto & aabb = block.getAABB();
+         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
+         {
+            if( block.getLevel() < refinementDepth_)
+               block.setMarker( true );
+         }
+      }
+   }
+};
+
+class LDC
+{
+ private:
+   const std::string refinementProfile_;
+   const uint_t refinementDepth_;
+
+   const FlagUID noSlipFlagUID_;
+   const FlagUID ubbFlagUID_;
+
+ public:
+   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
+
+   RefinementSelectionFunctor refinementSelector() const
+   {
+      return LDCRefinement(refinementDepth_);
+   }
+
+   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
+   {
+      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
+      {
+         auto& b           = dynamic_cast< Block& >(*bIt);
+         const uint_t level       = b.getLevel();
+         auto flagField     = b.getData< FlagField_T >(flagFieldID);
+         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
+         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
+         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
+         {
+            const Cell localCell = cIt.cell();
+            Cell globalCell(localCell);
+            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
+            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
+            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
+                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
+            {
+               flagField->addFlag(localCell, noslipFlag);
+            }
+         }
+      }
+   }
+};
diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
index 8110dbbb4437b8d3c56d9b855de501fd55521454..919755d6d7cd481dd90a2f1310965e0c6947432c 100644
--- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
+++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp
@@ -19,7 +19,6 @@
 //======================================================================================================================
 
 #include "blockforest/Initialization.h"
-#include "blockforest/SetupBlockForest.h"
 #include "blockforest/loadbalancing/StaticCurve.h"
 
 #include "core/Environment.h"
@@ -54,6 +53,7 @@
 
 #include <cmath>
 
+#include "LdcSetup.h"
 #include "NonUniformGridGPUInfoHeader.h"
 using namespace walberla;
 
@@ -69,82 +69,6 @@ using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField
 using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection;
 
 using gpu::communication::NonUniformGPUScheme;
-using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
-
-class LDCRefinement
-{
- private:
-   const uint_t refinementDepth_;
-
- public:
-   explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){};
-
-   void operator()(SetupBlockForest& forest) const
-   {
-      const AABB & domain = forest.getDomain();
-
-      real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
-      real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
-
-      AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
-                       domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
-
-      AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
-                        domain.xMax(), domain.yMin() + ySize, domain.zMax() );
-
-      for(auto & block : forest)
-      {
-         auto & aabb = block.getAABB();
-         if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) )
-         {
-            if( block.getLevel() < refinementDepth_)
-               block.setMarker( true );
-         }
-      }
-   }
-};
-
-class LDC
-{
- private:
-   const std::string refinementProfile_;
-   const uint_t refinementDepth_;
-
-   const FlagUID noSlipFlagUID_;
-   const FlagUID ubbFlagUID_;
-
- public:
-   explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){};
-
-   RefinementSelectionFunctor refinementSelector() const
-   {
-      return LDCRefinement(refinementDepth_);
-   }
-
-   void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID)
-   {
-      for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt)
-      {
-         auto& b           = dynamic_cast< Block& >(*bIt);
-         const uint_t level       = b.getLevel();
-         auto flagField     = b.getData< FlagField_T >(flagFieldID);
-         const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_);
-         const uint8_t ubbFlag    = flagField->registerFlag(ubbFlagUID_);
-         for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt)
-         {
-            const Cell localCell = cIt.cell();
-            Cell globalCell(localCell);
-            sbfs.transformBlockLocalToGlobalCell(globalCell, b);
-            if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
-            else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
-                     globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
-            {
-               flagField->addFlag(localCell, noslipFlag);
-            }
-         }
-      }
-   }
-};
 
 namespace {
 void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) {
diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..34bc6caa92b92d5239c9ca1409660b062247a469 100644
--- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py
@@ -26,7 +26,7 @@ class Scenario:
             },
             'Parameters': {
                 'omega': 1.95,
-                'timesteps': 10001,
+                'timesteps': 30001,
 
                 'refinementDepth': self.refinement_depth,
                 'writeSetupForestAndReturn': False,
@@ -37,7 +37,7 @@ class Scenario:
 
                 'remainingTimeLoggerFrequency': 3,
 
-                'vtkWriteFrequency': 5000,
+                'vtkWriteFrequency': 10000,
             },
             'Logging': {
                 'logLevel': "info",
diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
index c8992a65afb93fa7dae572959a654651f14aabde..2a59e6be99b49942a169d1c24921a93cbe8abd1e 100644
--- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
+++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp
@@ -18,7 +18,6 @@
 //
 //======================================================================================================================
 #include "blockforest/Initialization.h"
-#include "blockforest/communication/UniformDirectScheme.h"
 
 #include "core/Environment.h"
 #include "core/logging/Initialization.h"
@@ -66,7 +65,7 @@ typedef gpu::GPUField< real_t > GPUField;
 
 int main(int argc, char** argv)
 {
-   mpi::Environment env(argc, argv);
+   const mpi::Environment env(argc, argv);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
    gpu::selectDeviceBasedOnMpiRank();
 #endif
@@ -92,14 +91,14 @@ int main(int argc, char** argv)
 
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       // CPU fields
-      BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
+      const BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
       // GPU fields
-      BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
-      BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
-      BlockDataID vel_field_gpu =
+      const BlockDataID vel_field_gpu =
          gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
       BlockDataID phase_field_gpu =
          gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
@@ -215,15 +214,15 @@ int main(int argc, char** argv)
       auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
 #if defined(WALBERLA_BUILD_WITH_CUDA)
       auto normalTimeStep = [&]() {
-         Comm_velocity_based_distributions->startCommunication(defaultStream);
+         Comm_velocity_based_distributions->startCommunication();
          for (auto& block : *blocks)
             phase_field_LB_step(&block, defaultStream);
-         Comm_velocity_based_distributions->wait(defaultStream);
+         Comm_velocity_based_distributions->wait();
 
-         Comm_phase_field_distributions->startCommunication(defaultStream);
+         Comm_phase_field_distributions->startCommunication();
          for (auto& block : *blocks)
             hydro_LB_step(&block, defaultStream);
-         Comm_phase_field_distributions->wait(defaultStream);
+         Comm_phase_field_distributions->wait();
       };
       auto phase_only = [&]() {
          for (auto& block : *blocks)
diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
index fc229eb116f86ee6c967a8fba5a1948f7b73fc64..dfcd22a87e6942fb7ac2bc5789ac92fdd65fec9f 100644
--- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
+++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp
@@ -68,8 +68,8 @@ int main(int argc, char** argv)
 {
    const mpi::Environment env(argc, argv);
 
-   std::string input_filename(argv[1]);
-   bool inputIsPython = string_ends_with(input_filename, ".py");
+   const std::string input_filename(argv[1]);
+   const bool inputIsPython = string_ends_with(input_filename, ".py");
 
    for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
    {
diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
index c3076cb3e1f457ecae1f6a5090adac2314c34e97..21235056434f3daeebdbac212ec8de60d58810b4 100644
--- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py
@@ -22,12 +22,12 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK
 
 
 ldc_setup = {'Border': [
-    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
     {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
 ]}
 
 
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0..2607004f3749f366f8155a0dd200202f00e45867 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -25,7 +25,7 @@ foreach(streaming_pattern pull push aa esotwist)
 
             waLBerla_add_executable(NAME UniformGridGPU_${config}
                     FILES UniformGridGPU.cpp
-                    DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config})
+                    DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk UniformGridGPUGenerated_${config})
 
             # all configs are excluded from all except for pull d3q27.
             if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27")
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index ee022f457738fb6f8aa71f615441e9279fd25eca..fdc8969d626b866b978dfd1260565c50f96f01b8 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -167,22 +167,22 @@ int main(int argc, char** argv)
 
       if (timeStepStrategy == "noOverlap") {
          if (boundariesConfig){
-            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication")
+            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
                            << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
             timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");
          }else {
-            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication")
+            timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");}
 
       } else if (timeStepStrategy == "simpleOverlap") {
          if (boundariesConfig){
-            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication")
+            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
                            << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions");
             timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
             timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame");
          }else{
-            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication")
+            timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame");
             timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication")
                            << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");}
@@ -240,14 +240,14 @@ int main(int argc, char** argv)
          WALBERLA_GPU_CHECK(gpuPeekAtLastError())
 
          timeLoop.setCurrentTimeStepToZero();
-         WcTimingPool const timeloopTiming;
+         WcTimingPool timeloopTiming;
          WcTimer simTimer;
 
          WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
          WALBERLA_GPU_CHECK( gpuPeekAtLastError() )
          WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
          simTimer.start();
-         timeLoop.run();
+         timeLoop.run(timeloopTiming);
          WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
          simTimer.end();
 
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
index e1972d914a5acc26ab54aaf3cb86c615ac4d3b77..74be4378e0e2acef0bcb3c36f0f6d64916bba6c8 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py
@@ -26,12 +26,13 @@ BASE_CONFIG = {
 }
 
 ldc_setup = {'Border': [
-    {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
+    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
-    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
     {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
+    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
+
 ]}
 
 
@@ -55,7 +56,7 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
 
 
 class Scenario:
-    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1),
+    def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1),
                  timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False,
                  inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3,
                  init_shear_flow=False, boundary_setup=False,
@@ -110,7 +111,11 @@ class Scenario:
                 'innerOuterSplit': self.inner_outer_split,
                 'vtkWriteFrequency': self.vtk_write_frequency,
                 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
+            },
+            'Logging': {
+                'logLevel': 'info',  # info progress detail tracing
             }
+
         }
         if self.boundary_setup:
             config_dict["Boundaries"] = ldc_setup
@@ -184,12 +189,14 @@ def overlap_benchmark():
     # no overlap
     scenarios.add(Scenario(time_step_strategy='noOverlap',
                            inner_outer_split=(1, 1, 1),
-                           cuda_enabled_mpi=cuda_enabled_mpi))
+                           cuda_enabled_mpi=cuda_enabled_mpi,
+                           outer_iterations=1))
 
     for inner_outer_split in inner_outer_splits:
         scenario = Scenario(time_step_strategy='simpleOverlap',
                             inner_outer_split=inner_outer_split,
-                            cuda_enabled_mpi=cuda_enabled_mpi)
+                            cuda_enabled_mpi=cuda_enabled_mpi,
+                            outer_iterations=1)
         scenarios.add(scenario)
 
 
@@ -228,6 +235,7 @@ def single_gpu_benchmark():
                                 cuda_blocks=cuda_block_size,
                                 time_step_strategy='kernelOnly',
                                 timesteps=num_time_steps(block_size, 2000),
+                                outer_iterations=1,
                                 additional_info=additional_info)
             scenarios.add(scenario)
 
@@ -237,18 +245,18 @@ def validation_run():
     wlb.log_info_on_root("Validation run")
     wlb.log_info_on_root("")
 
-    time_step_strategy = "noOverlap"  # "noOverlap"
+    time_step_strategy = "noOverlap"  # "simpleOverlap"
 
     scenarios = wlb.ScenarioManager()
-    scenario = Scenario(cells_per_block=(64, 64, 64),
+    scenario = Scenario(cells_per_block=(128, 128, 128),
                         time_step_strategy=time_step_strategy,
-                        timesteps=1000,
+                        timesteps=10001,
                         outer_iterations=1,
                         warmup_steps=0,
                         init_shear_flow=False,
                         boundary_setup=True,
-                        vtk_write_frequency=0,
-                        remaining_time_logger_frequency=10)
+                        vtk_write_frequency=5000,
+                        remaining_time_logger_frequency=30)
     scenarios.add(scenario)
 
 
diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
index 2800b98cb65008ef5d66aa98853fb5589087c8d5..9b883282a0437628203cfb04de660f0301b5758a 100644
--- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
+++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp
@@ -72,7 +72,7 @@ typedef gpu::GPUField< uint8_t > GPUField_int;
 
 int main(int argc, char** argv)
 {
-   mpi::Environment Env(argc, argv);
+   const mpi::Environment Env(argc, argv);
    gpu::selectDeviceBasedOnMpiRank();
    exportDataStructuresToPython();
 
@@ -114,17 +114,17 @@ int main(int argc, char** argv)
       BlockDataID vel_field   = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
       BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
       // GPU fields
-      BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1);
-      BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
+      const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >(
          blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1);
       BlockDataID vel_field_gpu =
          gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
       BlockDataID phase_field_gpu =
          gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
       // Flag field
-      BlockDataID flagFieldID     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
-      BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true);
+      const BlockDataID flagFieldID     = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
+      const BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true);
 
       auto physical_parameters     = config->getOneBlock("PhysicalParameters");
       const real_t density_liquid  = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0));
@@ -181,11 +181,11 @@ int main(int argc, char** argv)
                                                               interface_thickness);
       pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
 
-      pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
+      const pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu,
                                                           vel_field_gpu, mobility, interface_thickness, gpuBlockSize[0],
                                                           gpuBlockSize[1], gpuBlockSize[2]);
 
-      pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
+      const pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu,
                                               gravitational_acceleration, interface_thickness, density_liquid,
                                               density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas,
                                               gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
@@ -193,8 +193,8 @@ int main(int argc, char** argv)
       ////////////////////////
       // ADD COMMUNICATION //
       //////////////////////
-      int streamLowPriority  = 0;
-      int streamHighPriority = 0;
+      const int streamLowPriority  = 0;
+      const int streamHighPriority = 0;
       auto defaultStream     = gpu::StreamRAII::newPriorityStream(streamLowPriority);
       auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
 
@@ -204,20 +204,20 @@ int main(int argc, char** argv)
          make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
       UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
       auto Comm_velocity_based_distributions =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(); });
       auto Comm_velocity_based_distributions_start =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(); });
       auto Comm_velocity_based_distributions_wait =
-         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(); });
 
       auto UniformGPUSchemePhaseField =
          make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
       auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
       UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
-      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); });
+      auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
       auto Comm_phase_field_start =
-         std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(defaultStream); });
-      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(); });
+      auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(); });
 
       auto UniformGPUSchemePhaseFieldDistributions =
          make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
@@ -225,11 +225,11 @@ int main(int argc, char** argv)
          make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
       UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
       auto Comm_phase_field_distributions =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
       auto Comm_phase_field_distributions_start =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
       auto Comm_phase_field_distributions_wait =
-         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(defaultStream); });
+         std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
 
       ////////////////////////
       // BOUNDARY HANDLING //
@@ -394,7 +394,7 @@ int main(int argc, char** argv)
                                                                                      targetRank, MPI_COMM_WORLD);
                   WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank)
                   {
-                     std::string path = "";
+                     const std::string path = "";
                      std::ostringstream out;
                      out << std::internal << std::setfill('0') << std::setw(6) << counter;
                      geometry::writeMesh(
diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
index 1856106c5b61880752c7216ee10eabda140485b1..0d57d59433ee7879ff3deb272d2a8c1cc8ff3419 100644
--- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
+++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp
@@ -205,7 +205,7 @@ int main(int argc, char** argv)
    const bool sendDirectlyFromGPU = false;
    gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU);
    com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
-   auto communication = std::function< void() >([&]() { com.communicate(nullptr); });
+   auto communication = std::function< void() >([&]() { com.communicate(); });
 #else
    blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks);
    communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId));
diff --git a/extern/pybind11 b/extern/pybind11
index 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07..f7b499615e14d70ab098a20deb0cdb3889998a1a 160000
--- a/extern/pybind11
+++ b/extern/pybind11
@@ -1 +1 @@
-Subproject commit 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07
+Subproject commit f7b499615e14d70ab098a20deb0cdb3889998a1a
diff --git a/src/blockforest/Initialization.cpp b/src/blockforest/Initialization.cpp
index b91923eebd5b53156fbe3e916c8aeb84963a283b..d800a75b10a112edba39a6655c84bfad53cb1426 100644
--- a/src/blockforest/Initialization.cpp
+++ b/src/blockforest/Initialization.cpp
@@ -130,7 +130,7 @@ shared_ptr< StructuredBlockForest > createUniformBlockGridFromConfig( const Conf
                                               cell_idx_c(cells[1])-1,
                                               cell_idx_c(cells[2])-1 );
 
-      uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() );
+      const uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() );
 
       calculateCellDistribution( cells, nrOfProcesses, blocks, cellsPerBlock );
    }
diff --git a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
index 38a97d82451d6fc114541f892aa023660c3a9b02..c3ae21b55cd3709ff28bdf7012151d51b6b311eb 100644
--- a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
+++ b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h
@@ -65,7 +65,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    BoundarySetter<Handling> boundarySetter;
    boundarySetter.setConfigBlock( blockHandle );
 
-   std::string directionStr = blockHandle.getParameter<std::string>( "direction" );
+   const std::string directionStr = blockHandle.getParameter<std::string>( "direction" );
    cell_idx_t  wallDistance            = blockHandle.getParameter<cell_idx_t>( "walldistance", 0 );
    cell_idx_t  ghostLayersToInitialize = blockHandle.getParameter<cell_idx_t>( "ghostLayersToInitialize", std::numeric_limits<cell_idx_t>::max() );
 
@@ -75,8 +75,8 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    using stencil::D3Q7;
    for( auto dirIt = D3Q7::beginNoCenter(); dirIt != D3Q7::end(); ++dirIt )
    {
-      bool isAll = string_icompare( directionStr, "all" ) == 0;
-      bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(),
+      const bool isAll = string_icompare( directionStr, "all" ) == 0;
+      const bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(),
                                              stencil::dirToString[*dirIt] ) != directionStrings.end();
 
       if( isAll || isInDirectionStrings )
@@ -87,7 +87,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block
    }
 
    if ( ! atLeastOneBoundarySet )
-      WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B ");
+      WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B ")
 }
 
 
diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h
index 093ec4cad2a830a80042073f905bb1c7316bf8ae..745d28cc5f18e0df1ce6eeeda0cfbf5d478656ee 100644
--- a/src/gpu/communication/NonUniformGPUScheme.h
+++ b/src/gpu/communication/NonUniformGPUScheme.h
@@ -307,6 +307,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i
       for (auto it : headers_[EQUAL_LEVEL][index])
          bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear();
 
+   // wait until communication-dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -397,6 +400,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
       for (auto it : headers_[COARSE_TO_FINE][index])
          bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear();
 
+   // wait until communication-dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -431,7 +437,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t
                {
                   WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                   WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir))
-                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
+                  pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr);
                }
             }
             else
@@ -500,6 +506,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
       for (auto it : headers_[FINE_TO_COARSE][index])
          bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear();
 
+   // wait until communication-dependent kernels are finished
+   WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
    // Start filling send buffers
    for (auto& iBlock : *forest)
    {
@@ -532,7 +541,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t
             {
                WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur())
                WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir))
-               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]);
+               pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr);
             }
          }
          else
diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h
index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..bc481d8950c25d4aa5196316c641e8b67e34318a 100644
--- a/src/gpu/communication/UniformGPUScheme.h
+++ b/src/gpu/communication/UniformGPUScheme.h
@@ -18,7 +18,6 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
-
 #pragma once
 
 #include "blockforest/StructuredBlockForest.h"
@@ -32,9 +31,7 @@
 
 #include <thread>
 
-#include "gpu/GPURAII.h"
 #include "gpu/GPUWrapper.h"
-#include "gpu/ParallelStreams.h"
 #include "gpu/communication/CustomMemoryBuffer.h"
 #include "gpu/communication/GeneratedGPUPackInfo.h"
 
@@ -49,29 +46,34 @@ namespace communication {
    class UniformGPUScheme
    {
    public:
-       explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
-                                  bool sendDirectlyFromGPU = false,
-                                  bool useLocalCommunication = true,
+       explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                  const bool sendDirectlyFromGPU = false,
+                                  const bool useLocalCommunication = true,
                                   const int tag = 5432 );
 
-       explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf,
-                                 const Set<SUID> & requiredBlockSelectors,
-                                 const Set<SUID> & incompatibleBlockSelectors,
-                                 bool sendDirectlyFromGPU = false,
-                                 bool useLocalCommunication = true,
-                                 const int tag = 5432 );
+       explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                  const Set<SUID> & requiredBlockSelectors,
+                                  const Set<SUID> & incompatibleBlockSelectors,
+                                  const bool sendDirectlyFromGPU = false,
+                                  const bool useLocalCommunication = true,
+                                  const int tag = 5432 );
+       ~UniformGPUScheme()
+       {
+          for (uint_t i = 0; i < Stencil::Q; ++i)
+             WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i]))
+       }
 
        void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-       void startCommunication( gpuStream_t stream = nullptr);
-       void wait( gpuStream_t stream = nullptr);
+       void startCommunication();
+       void wait();
 
-       void operator()( gpuStream_t stream = nullptr )         { communicate( stream ); }
-       inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); }
+       void operator()()         { communicate( ); }
+       inline void communicate() { startCommunication(); wait(); }
 
-       std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr );
-       std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr );
-       std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr );
+       std::function<void()> getCommunicateFunctor();
+       std::function<void()> getStartCommunicateFunctor();
+       std::function<void()> getWaitFunctor();
 
    private:
        void setupCommunication();
@@ -81,8 +83,8 @@ namespace communication {
 
        bool setupBeforeNextCommunication_;
        bool communicationInProgress_;
-       bool sendFromGPU_;
-       bool useLocalCommunication_;
+       const bool sendFromGPU_;
+       const bool useLocalCommunication_;
 
        using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer;
        using GpuBuffer_T = gpu::communication::GPUMemoryBuffer;
@@ -92,8 +94,6 @@ namespace communication {
 
        std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
 
-       ParallelStreams parallelSectionManager_;
-
        struct Header
        {
            BlockID blockId;
@@ -103,6 +103,8 @@ namespace communication {
 
        Set<SUID> requiredBlockSelectors_;
        Set<SUID> incompatibleBlockSelectors_;
+
+       gpuStream_t streams_[Stencil::Q];
    };
 
 
diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h
index 93f6dd85e0e3f44293b9943e1bf252ce52c6ad33..84d9e0f22dd5661d1d428525d3758a5bb9a29488 100644
--- a/src/gpu/communication/UniformGPUScheme.impl.h
+++ b/src/gpu/communication/UniformGPUScheme.impl.h
@@ -19,10 +19,6 @@
 //
 //======================================================================================================================
 
-#include "core/mpi/MPIWrapper.h"
-
-#include "gpu/ParallelStreams.h"
-
 namespace walberla {
 namespace gpu
 {
@@ -30,9 +26,9 @@ namespace communication {
 
 
    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
-                                                bool sendDirectlyFromGPU,
-                                                bool useLocalCommunication,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
+                                                const bool sendDirectlyFromGPU,
+                                                const bool useLocalCommunication,
                                                 const int tag )
         : blockForest_( bf ),
           setupBeforeNextCommunication_( true ),
@@ -41,7 +37,6 @@ namespace communication {
           useLocalCommunication_(useLocalCommunication),
           bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
           bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-          parallelSectionManager_( -1 ),
           requiredBlockSelectors_( Set<SUID>::emptySet() ),
           incompatibleBlockSelectors_( Set<SUID>::emptySet() )
    {
@@ -52,14 +47,17 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
    }
 
    template<typename Stencil>
-   UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf,
+   UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf,
                                                 const Set<SUID> & requiredBlockSelectors,
                                                 const Set<SUID> & incompatibleBlockSelectors,
-                                                bool sendDirectlyFromGPU,
-                                                bool useLocalCommunication,
+                                                const bool sendDirectlyFromGPU,
+                                                const bool useLocalCommunication,
                                                 const int tag )
       : blockForest_( bf ),
         setupBeforeNextCommunication_( true ),
@@ -68,7 +66,6 @@ namespace communication {
         useLocalCommunication_(useLocalCommunication),
         bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
         bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
-        parallelSectionManager_( -1 ),
         requiredBlockSelectors_( requiredBlockSelectors ),
         incompatibleBlockSelectors_( incompatibleBlockSelectors )
    {
@@ -78,11 +75,14 @@ namespace communication {
          WALBERLA_CHECK(!sendDirectlyFromGPU)
 #endif
       }
+
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+         WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i]))
    }
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::startCommunication( )
    {
       WALBERLA_ASSERT( !communicationInProgress_ )
       auto forest = blockForest_.lock();
@@ -102,9 +102,11 @@ namespace communication {
          for( auto it : headers_ )
             bufferSystemGPU_.sendBuffer( it.first ).clear();
 
+      // wait until communication-dependent kernels are finished
+      WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
+
       // Start filling send buffers
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto &iBlock : *forest )
          {
             auto senderBlock = dynamic_cast< Block * >( &iBlock );
@@ -127,7 +129,7 @@ namespace communication {
                   auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) );
                   for (auto& pi : packInfos_)
                   {
-                     pi->communicateLocal(*dir, senderBlock, receiverBlock, stream);
+                     pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]);
                   }
                }
                else
@@ -136,26 +138,27 @@ namespace communication {
 
                   for( auto &pi : packInfos_ )
                   {
-                     parallelSection.run([&](auto s) {
                      auto size = pi->size( *dir, senderBlock );
                      auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
                      WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                     pi->pack( *dir, gpuDataPtr, senderBlock, s );
+                     pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] );
 
                      if( !sendFromGPU_ )
                      {
                         auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
                         WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
-                        WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s ))
+                        WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] ))
                      }
-                     });
                   }
                }
             }
          }
       }
       // wait for packing to finish
-      WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) );
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
 
       if( sendFromGPU_ )
          bufferSystemGPU_.sendAll();
@@ -167,7 +170,7 @@ namespace communication {
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::wait( gpuStream_t stream )
+   void UniformGPUScheme<Stencil>::wait()
    {
       WALBERLA_ASSERT( communicationInProgress_ )
 
@@ -175,7 +178,6 @@ namespace communication {
 
       if( sendFromGPU_ )
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
          {
             recvInfo.buffer().clear();
@@ -188,16 +190,13 @@ namespace communication {
                   auto size = pi->size( header.dir, block );
                   auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                  parallelSection.run([&](auto s) {
-                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-                  });
+                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] );
                }
             }
          }
       }
       else
       {
-         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
          {
             auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank());
@@ -214,17 +213,17 @@ namespace communication {
                   auto gpuDataPtr = gpuBuffer.advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr )
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr )
-                  parallelSection.run([&](auto s) {
                      WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                           gpuMemcpyHostToDevice, s ))
-                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
-                  });
+                                                           gpuMemcpyHostToDevice, streams_[header.dir] ))
+                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] );
                }
             }
          }
       }
-
-      WALBERLA_GPU_CHECK( gpuDeviceSynchronize() )
+      for (uint_t i = 0; i < Stencil::Q; ++i)
+      {
+         WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i]))
+      }
       communicationInProgress_ = false;
    }
 
@@ -312,21 +311,21 @@ namespace communication {
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor()
    {
-      return [this, stream]() { communicate( stream ); };
+      return [this]() { communicate( ); };
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor()
    {
-      return [this, stream]() { startCommunication( stream ); };
+      return [this]() { startCommunication(); };
    }
 
    template< typename Stencil >
-   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream)
+   std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor()
    {
-      return [this, stream]() { wait( stream ); };
+      return [this]() { wait(); };
    }
 
 } // namespace communication
diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp
index 5bc87aa13f9a2c72351b532fb20e3614cdfc8d82..a8a60321d9ee45f131843dbcb9f974f41ace0b01 100644
--- a/tests/gpu/communication/CommTest.cpp
+++ b/tests/gpu/communication/CommTest.cpp
@@ -47,7 +47,7 @@ void hostToHost()
       hostField2.set(hostField1);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void hostToDevice()
@@ -61,7 +61,7 @@ void hostToDevice()
       gpu::fieldCpy(deviceField, hostField);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void deviceToHost()
@@ -76,7 +76,7 @@ void deviceToHost()
       gpu::fieldCpy(hostField, deviceField);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiHostToHost()
@@ -100,7 +100,7 @@ void mpiHostToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiHostToDevice()
@@ -124,7 +124,7 @@ void mpiHostToDevice()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiDeviceToHost()
@@ -148,7 +148,7 @@ void mpiDeviceToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiDeviceToDevice()
@@ -172,7 +172,7 @@ void mpiDeviceToDevice()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiCopyHostToDevice()
@@ -199,7 +199,7 @@ void mpiCopyHostToDevice()
       gpu::fieldCpy(deviceField, hostField2);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 void mpiCopyDeviceToHost()
@@ -226,7 +226,7 @@ void mpiCopyDeviceToHost()
       MPI_Wait(&request2, MPI_STATUS_IGNORE);
    }
    double const endTime = MPI_Wtime();
-   std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl;
+   std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n';
 }
 
 int main(int argc, char** argv)